Compare commits


475 commits

Author SHA1 Message Date
727ca1ee76 fixed the duplicate columns problem by removing them from combining_dfs.py 2021-11-24 07:57:20 +00:00
6550be3350 added info re having run mcsm_na for RNAP 2021-11-19 07:51:13 +00:00
7fd5e2710d ran mcsm_na for rpob's RNAP complex, i.e. 5UHC 2021-11-19 07:48:42 +00:00
69b8ba9d08 ran mcsm format for embb 2021-11-13 09:43:56 +00:00
45c48485f1 saving work after running combining_dfs.py 2021-11-12 14:16:48 +00:00
1ddc5045d5 added TESTING_plots.R 2021-11-09 13:55:21 +00:00
34ee2519d3 added FIXME and TODO related to alr in combining_dfs.py 2021-11-09 13:23:50 +00:00
246cd636a1 saving work in LSHTM_analysis before combining data for targets 2021-11-09 12:44:11 +00:00
80f73a3697 cherry-pick mcsm_na/run_format_results_mcsm_na.py from master to ensure consistency 2021-10-28 12:54:04 +01:00
7d6087c82e saving ppi2 format script on embb_dev branch 2021-10-28 12:22:46 +01:00
9c37dbee31 bring in embb stuff which was in the wrong branch 2021-10-28 11:18:13 +01:00
1e3670f935 added log10 OR and P values to myaf_or_calcs.R 2021-08-23 20:01:01 +01:00
0c16937b68 added corr plots as function for interactive graphs on shiny 2021-08-20 18:52:47 +01:00
c0c30fd527 added format_results_dynamut2.py and ran shiny scripts for barplots 2021-08-19 16:25:38 +01:00
9cb33ed67b added pdb_fasta_plot.R for generating some useful plots for shiny 2021-08-17 10:55:06 +01:00
067fc85163 extracting gid seq from pdb file using pdbtools 2021-08-17 10:53:26 +01:00
9b1d1d009d added aa_index/ with script that return dfs for plots for shiny perhaps 2021-08-13 16:22:11 +01:00
1ea42097ae added dynamut results formatting scripts, although they need to be rerun once b7 completes 2021-08-13 13:24:22 +01:00
2e9d142184 indicated f for format for mcsm_na formatting script 2021-08-13 13:23:42 +01:00
2eee69ee80 saving work 2021-08-12 17:37:56 +01:00
938dba7fcc extracted results for dynamut gid bissection b10_21 2021-08-12 17:35:12 +01:00
59a370b45a Merge branch 'master' into gidb_dev 2021-08-12 15:35:28 +01:00
3086972480 minor tidy up for script submit_dynamut 2021-08-12 15:33:57 +01:00
a641347f63 reran b7 since previous run file output was 0 bytes 2021-08-12 15:29:36 +01:00
e48f215227 ran b9 and b10 for gid after Dynamut team reran due to server issues 2021-08-12 10:06:43 +01:00
96277d78f6 saving dynamut and mcsm_na jobs submitted and retrieved 2021-08-11 17:32:15 +01:00
7c0824d0f2 added script for formatting mcsm_na results 2021-08-06 19:12:57 +01:00
656639e871 ran submit and get_results for one last batch for mcsm_na and did some bash formatting to get proper filenames, etc. 2021-08-06 19:09:29 +01:00
718f92d7ff resuming work after conference 2021-08-05 16:54:34 +01:00
6759649c61 indicated which cols are not available for pnca as I ran these scripts for generating plots for the poster 2021-07-07 13:12:29 +01:00
5eb07cdf86 added leg_title size for bp function 2021-07-07 13:11:13 +01:00
4bf4650c88 generated pncA plot for poster for ps_combined 2021-07-07 11:38:07 +01:00
a6f0832a42 reran plots with current lig dist 2021-06-30 17:35:57 +01:00
b679068a5e added the almost done shiny for barplots subcolours 2021-06-30 17:20:04 +01:00
c599d28377 renamed barplot_colour_function.R to bp_subcolours.R and reflected it in scripts using it. 2021-06-29 14:05:48 +01:00
9f5b983bc0 added barplots_subcolours.R that generates heatmap style barplots 2021-06-29 14:00:10 +01:00
89e6b03673 moved subcols script to redundant 2021-06-29 13:59:38 +01:00
a9f9cec494 moved barplot_colour_function.R to functions 2021-06-29 13:58:22 +01:00
29d9717abe updated running_plotting_scripts.txt with corr_plots.R 2021-06-28 17:30:25 +01:00
20976c31bb moved corr_data and corr_PS_LIG.R to redundant 2021-06-28 17:29:31 +01:00
0f983d2889 added corr_plots.R to generate corr plots by adding source data in get_plotting_dfs.R and tested with cmd 2021-06-28 17:27:50 +01:00
b614962e45 added corr data to get_plotting_dfs.R and generate corr plots 2021-06-28 17:25:45 +01:00
639ccf1cd7 moved old logo plots scripts to redundant and updated running_plotting_scripts.txt to reflect these and how to run the single logo_plots.R to generate logo plots 2021-06-24 17:45:40 +01:00
f1a8fb583a added logo_plots.R that now produces all logo plots while sourcing the get_plotting_df.R script 2021-06-24 17:34:53 +01:00
e75cfd2665 checked logo_multiple_muts.R with the new sourcing script for data 2021-06-24 16:43:23 +01:00
71d874e350 added get_plotting_dfs.R as a mother script to be sourced by all plotting scripts 2021-06-24 14:21:34 +01:00
5eba273a55 made logo_plot.R source the script that pulls in all the data 2021-06-24 14:19:46 +01:00
506e639a7b moved my_pairs_panel.R to functions/ 2021-06-24 12:13:15 +01:00
762b1a3931 fixed cmd running script problem for logo plots 2021-06-24 12:12:36 +01:00
e822f9f690 added first line to all func to run from 2021-06-24 10:02:14 +01:00
1c27bbff11 saving work on logo plots before finishing 2021-06-23 16:49:18 +01:00
8f4daba98d generated logo_plot.R from cmd, checked 2021-06-23 16:35:44 +01:00
7e6affea84 added test_plotting_data.R, and replaced input param of csv into df 2021-06-23 16:16:23 +01:00
2aec79af31 changes made to combining_dfs_plotting.R 2021-06-23 16:15:15 +01:00
c6d1260f74 updated logo_plot.R with functions 2021-06-23 12:06:41 +01:00
13c61e7813 moved combining_dfs_plotting.R to function and added test script for this as well 2021-06-22 18:15:15 +01:00
ac383165ec added files that were moved to redundant 2021-06-22 18:06:08 +01:00
04a7cf15dc turned combining_dfs_plotting.R to a function and moved old script to redundant 2021-06-22 18:04:10 +01:00
e10ab6a7c6 updating script to sort out proper merging for plotting 2021-06-22 14:46:03 +01:00
064182d784 took extra lines from data extraction 2021-06-21 16:15:44 +01:00
920007cc83 added af_or to add to combining_dfs.py 2021-06-21 14:53:04 +01:00
8a301e8bb1 added deep ddg formatted data to combining_dfs.py 2021-06-21 12:56:06 +01:00
9534fc57d4 added deepddg data to combining_dfs.py 2021-06-21 11:53:56 +01:00
f79aea254e added function to add aa code for mcsm and gwas style mutations to a given file 2021-06-18 17:48:26 +01:00
f6a2e029cb saving work before adding files 2021-06-18 17:47:09 +01:00
86ed1805fc Merge branch 'gidb_dev' 2021-06-14 13:27:00 +01:00
ddb1a7a7aa added aa_prop.py and add_aa_prop.py to add aa properties for wt and mutant in a given file containing one letter code wt and mut cols as csv 2021-06-14 13:24:00 +01:00
57e4d8cd1e changed aa_prop_water to 3 categ according to KD, updated ref dict 2021-06-14 13:22:56 +01:00
81ab3fe5ba added function and test for aa_prop_bp.R 2021-06-14 09:22:05 +01:00
ca1a0e10ca added example for layout 2021-06-14 09:06:30 +01:00
687adf0ec7 weird pdbtools commit 2021-06-11 21:45:18 +01:00
8fa9faa17d added another aa dict type to reference_dict.py and calculated electrostatic changes for muts based on adding these properties to mcsm mut style snps. This will allow the calculation on a given file type since the ref dict can now easily be adapted. 2021-06-11 17:12:21 +01:00
f88e2665e9 calculating af_or using function and cmd options now 2021-06-11 15:12:08 +01:00
7686aa39b4 added script to test af_or_calcs 2021-06-11 13:33:25 +01:00
931f8ec2f9 added mychisq_or.R and af_or_calcs.R 2021-06-11 13:28:07 +01:00
b6df47a0cd moved old af_or_calcs.R to redundant 2021-06-11 13:27:40 +01:00
acda9f13e5 saving the correct af or script 2021-06-11 13:26:28 +01:00
e78707067c saving work before converting to a function 2021-06-11 13:25:02 +01:00
a2431b59e5 minor tweak to plotting_globals.R to make gene_match a global var 2021-06-11 11:21:20 +01:00
f6259aa517 moved functions/ in the scripts dir 2021-06-11 11:11:39 +01:00
0c3645705d moved old bp scripts to redundant 2021-06-10 16:18:08 +01:00
dccdfe9742 moved plotting_func to functions and replaced 3 basic_barplots scripts with 1 2021-06-10 16:09:58 +01:00
5c018e23be added function for position_count_bp.R 2021-06-10 14:46:11 +01:00
4bee48f545 added functions dir for further tidying and tested this with ind scripts for stability 2021-06-09 18:13:18 +01:00
786eaabe1a moved bp function script to function/ 2021-06-09 17:08:56 +01:00
225360fb93 added shiny app and turned stability bp to function 2021-06-09 17:05:02 +01:00
3f58a5c64c saving work 2021-06-09 16:27:05 +01:00
776c4e0279 repurposed and ran basic_barplots for lig and foldx including filenames 2021-06-09 11:33:08 +01:00
d45a9499a2 repurposed basic_barplots_foldx.R 2021-06-09 11:24:50 +01:00
6f24fc1fac updated how to run plotting scripts. This is a cleaner version to keep up-to-date 2021-06-08 16:53:07 +01:00
ce8abafdfe wrapper script basic_barplots_PS.R now takes cmd and calls functions to generate plots. Tested and verified. 2021-06-08 16:48:19 +01:00
b25511a239 tidied plotting_data.R as a function returning a list of dfs 2021-06-08 16:00:28 +01:00
b8d0bc416a added plotting_globals and text file with info on how to run plotting scripts 2021-06-04 17:26:01 +01:00
d21605b31f tweaking basic bp to make it generic 2021-06-04 17:23:41 +01:00
4f60e93abb minor updates to dir.R 2021-06-04 15:05:52 +01:00
7242b3516b adapted combining_dfs.py and plotting.R for gid and attempting to make it generic 2021-06-04 14:36:16 +01:00
d52534f676 test branch commit 2021-06-04 09:43:48 +01:00
18af246c24 saving before starting work 2021-06-04 09:38:17 +01:00
8009c3fe3d updated counts.py with wt seq counts 2021-03-03 11:54:48 +00:00
c59e3f178d added adjusted p-values for DM muts comparison 2021-02-27 10:42:04 +00:00
bbec97b00c updated count.py with indel and stop codon count 2021-02-24 09:56:36 +00:00
9062751790 retrieved results for gid b8 and b9 2021-02-23 08:59:01 +00:00
77efd0b76d retrieved gid b7 and submitted b8,b9 and b10 2021-02-22 09:31:29 +00:00
88dad2696f retrieved results for gid b6 2021-02-21 16:23:22 +00:00
34c0b808ea added count.py to count samples for quick checks 2021-02-21 16:07:33 +00:00
05562399ce saving work and generating revised_figure7 2021-02-20 16:17:38 +00:00
9f03e6a6fd dynamut retrieved b5 and b6, submitted 6 and 7 2021-02-20 13:05:30 +00:00
2995299179 code to retrieve results from batch 4 and 5 once ready 2021-02-19 12:09:26 +00:00
f9249d7bf2 updated .gitignore 2021-02-18 12:01:04 +00:00
d683e971d4 updated .gitignore to include temp dirs 2021-02-18 11:54:36 +00:00
8dc3a790c0 add files 2021-02-18 11:50:46 +00:00
69b62e54a5 running dynamut in batches 2021-02-18 11:27:20 +00:00
cfdd18086a renamed files in dynamut for consistency 2021-02-18 10:52:51 +00:00
9a0e98eb24 renamed file in mcsm_na to be consistent 2021-02-18 10:51:17 +00:00
2168007f12 renaming file 2021-02-18 10:48:06 +00:00
19d89230f5 renamed file run_submit to run_submit_dynamut 2021-02-18 10:45:35 +00:00
a9a4483aee renamed file run_results to run_get_results 2021-02-18 10:43:45 +00:00
cd06a83e13 ran mcsm_na for all 26 batches for gid 2021-02-16 13:55:31 +00:00
013bba2503 submitting mcsm_na jobs manually 2021-02-16 10:51:06 +00:00
b69d9d729a added get_results_mcsm_na.py run_get_results.py to retrieve results for each batch run of 20 for mcsm_na 2021-02-15 12:22:52 +00:00
7a74fecbda saving work for mcsm_na 2021-02-15 12:22:19 +00:00
322979406c added mcsm_na_temp 2021-02-12 17:40:02 +00:00
1f72001689 added shell script to format muts for mcsm NA 2021-02-12 17:38:42 +00:00
c99f1cac92 added mcsm_na scripts to submit batches of 20 2021-02-12 16:51:41 +00:00
b2397ea99d minor code tidy-up 2021-02-12 16:50:34 +00:00
9c221e6786 tested and added note to reflect that tar.gz needs to be made into a cmd line option 2021-02-12 15:32:16 +00:00
7f75b92553 checked tar.gz download from the script with example 2021-02-12 15:25:32 +00:00
56f5479c0b added tar.gz download within get_results.py 2021-02-12 15:24:51 +00:00
80f7e039ab separated defs and calls and added a separate script to test examples 2021-02-12 14:15:55 +00:00
4e19961283 updating and cleaning get_results script 2021-02-12 12:04:49 +00:00
7116b45bf8 updating get_results_def.py 2021-02-12 11:38:21 +00:00
28521104f8 added example files to test dynamut results fetching for single and multiple urls 2021-02-11 19:22:19 +00:00
1d8e6f0d75 updated with def for get_results.py for dynamut 2021-02-11 19:21:26 +00:00
2e047fd548 extracting single mut url from the batch processing step 2021-02-11 17:19:04 +00:00
5d6ddb7639 added submit_def.py with example to run batch of 50 2021-02-11 14:36:32 +00:00
cfe9028a9c added split_csv.sh 2021-02-11 13:42:14 +00:00
2eab17cb9e uncommented some debug output for mcsm; pandas and numpy conflict temporarily resolved by running from the base env 2021-02-11 10:53:23 +00:00
d159a81cfb saving work in dynamut submit 2021-02-11 09:46:11 +00:00
fad1526ce5 dynamut scripts and minor change dir for rd_df.py 2021-02-10 15:40:33 +00:00
0fd3e75ab0 renamed files 2021-02-10 11:53:20 +00:00
600f829972 added sample test_snps 2021-02-10 10:38:08 +00:00
d139342074 updated minor changes 2021-02-10 10:37:44 +00:00
491b317752 added deprecated shell scripts 2021-02-10 10:36:02 +00:00
98287b3c20 updated testing cmds for foldx 2021-02-10 10:32:09 +00:00
ab7bed9f4b added test2/ for testing updated foldx script 2021-02-10 10:16:28 +00:00
56ca9db40d added script to submit jobs 2021-02-09 20:16:27 +00:00
5e735af323 adding and saving files 2021-02-09 18:30:47 +00:00
0c95b3a512 testing dynamut script 2021-02-09 18:28:16 +00:00
bcf4467c44 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2021-02-09 16:12:34 +00:00
64018cce4c added dynamut dir 2021-02-09 16:11:07 +00:00
6b6921d45f work from thinkpad 2021-02-09 16:03:02 +00:00
534a6754cd add foldx5 wrapper 2021-02-09 15:45:21 +00:00
4163ede798 don't break when the pdb file is in a weird place with a weird name 2021-02-09 15:20:55 +00:00
8302d01867 check to handle missing I/O/P dirs if drug unset 2021-02-09 15:00:03 +00:00
725e9b53ca test2 runfoldx symlink 2021-02-09 14:43:03 +00:00
56150ae3c8 various changes 2021-02-09 14:42:44 +00:00
ca68996264 renamed file runFoldx.py in test2/ to reflect this 2021-02-09 10:54:35 +00:00
86670bbac3 remove shell scripts run with subprocess() and launch foldx directly from python 2021-02-08 18:06:02 +00:00
9df3913a84 modifying script to avoid invoking bash as a subprocess 2021-02-08 16:59:42 +00:00
99b77434b5 more debug 2021-02-08 16:16:53 +00:00
fa25a30dcf fixup broken shell scripts 2021-02-08 15:44:21 +00:00
1f8cfc2403 test2 bugfixes 2021-02-08 15:24:22 +00:00
7a9b16255a added user defined option for processing dir to allow me to specify external storage device for running it 2020-12-02 11:26:26 +00:00
08ad16adbb added chain_extract.py and pdb_chain_extract.py 2020-11-30 14:11:46 +00:00
fc4313045f adding options to specify files by user 2020-11-27 13:02:15 +00:00
20bba2ad70 added my_pdbtools containing pdbtools cloned from a git repo 2020-11-17 13:56:23 +00:00
802522d1c6 updating notes to running_scripts.py as running for another drug-target 2020-11-17 13:55:16 +00:00
ac5b86a9cd modified running script to mention chain info for foldx 2020-11-16 16:16:24 +00:00
2ac4ea8f5c added script to interrogate pdb files mainly for res numbers 2020-11-16 16:01:31 +00:00
ccdd6029be updated results summary in the data_extraction.py 2020-11-12 17:05:29 +00:00
f9fd74812a handling missing dir for data_extraction.py 2020-11-12 13:21:06 +00:00
b0b9e91af7 added what is required as a minimum to run data_extraction 2020-11-06 19:04:27 +00:00
b2284f7216 added base histogram script for af and or 2020-10-13 13:38:17 +01:00
1f9ea3f789 added ns prefix to SNPs to avoid ambiguity 2020-10-13 13:37:22 +01:00
59911687c8 changing labels in graphs for frontiers journal 2020-10-09 13:10:08 +01:00
2f1f02e1de renamed other_plots.R to other_plots_combined.R and changing labels to capital letters for journal 2020-10-09 12:17:24 +01:00
667804ad83 saving work minor changes perhaps 2020-10-08 16:03:12 +01:00
7f5ca7f5a4 added af and OR columns in the data 2020-10-06 19:39:59 +01:00
69f3629cc0 indicated hardcoded active site residues for pnca 2020-10-06 19:12:32 +01:00
be50636b15 script to subset data for dnds cals 2020-10-06 19:11:34 +01:00
4285bbd59f added barplot_subcolours_aa_combined.R to combine and label these plots 2020-10-06 18:43:20 +01:00
18b6407539 adjusted x axis position label for barplot_subcols_aa_LIG.R 2020-10-06 18:42:24 +01:00
9784cba232 generated labelled ps_plots_combined.R and capital "P" for position in barplots coloured aa for Lig 2020-10-06 18:15:50 +01:00
e60b4c5492 output corr plots with coloured dots 2020-10-06 17:47:24 +01:00
9d2d6cfd84 updated TASK in hist_af_or_combined.R 2020-10-06 16:43:59 +01:00
a549e52825 renamed dist_plots.R to dist_plots_check.R as its exploratory 2020-10-06 16:39:24 +01:00
5f441d09d9 added hist_af_or_combined.R to generate plots for output and moved previous run to scratch_plots/ 2020-10-06 16:33:25 +01:00
f240c969ec added hist_af.R 2020-10-06 15:07:42 +01:00
07104a8c8e updated .gitignore 2020-10-06 09:55:19 +01:00
74c4ef16ae added basic_barplots_foldx.R for supp figure 2020-10-06 09:53:34 +01:00
4c345ea9f4 moved not required plots to scratch 2020-10-06 09:52:54 +01:00
9597997741 saving predictions script 2020-09-30 14:09:08 +01:00
8a6c7968f5 added predictions for ps and lig and output to results 2020-09-30 13:12:05 +01:00
a77b472dfa added prediction.R to do logistic regression 2020-09-30 10:04:49 +01:00
d2093e7a4c added ../data_extraction_epistasis.py for getting list for epistasis work 2020-09-29 16:09:54 +01:00
81796df71a added corr_data.R corr_PS_LIG_all.R corr_PS_LIG_v2.R 2020-09-29 16:08:25 +01:00
c58fa8cd4d added dist_plot.R to generate plots for writing results 2020-09-23 19:24:42 +01:00
48050752db added more analysis in extreme_muts.R to be tidied later 2020-09-23 19:23:34 +01:00
a3aab4556a added fold and duet agreement to extreme_muts.R 2020-09-23 11:20:22 +01:00
6d08b646fc added foldx scaled and foldx outcome to plotting_data.R 2020-09-23 11:12:41 +01:00
5579e9527b updated extreme_muts.R with number of budding hotspots and mult muts numbers 2020-09-23 11:02:13 +01:00
tgttunstall f7280ceada Update README.md 2020-09-21 18:11:24 +01:00
tgttunstall 807876d919 Update README.md 2020-09-21 18:11:10 +01:00
tgttunstall baedea8c5b Update README.md 2020-09-21 18:09:55 +01:00
tgttunstall 0eca5cf859 Update README.md 2020-09-21 18:08:49 +01:00
tgttunstall ac3c8a8086 Update README.md 2020-09-21 18:07:58 +01:00
5ceea2e7b7 updated gitignore for more tidying 2020-09-21 17:58:51 +01:00
63fa0c596a updated gitignore to tidyup 2020-09-21 17:54:54 +01:00
7239ab220b remove unneeded dir 2020-09-21 17:49:19 +01:00
2297617af2 added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R 2020-09-21 17:46:22 +01:00
be8fa7e639 saving combined bubble plot with labels 2020-09-18 18:19:55 +01:00
7e8d5c869e updated .gitignore to include .RData 2020-09-18 18:10:23 +01:00
edabe0d776 added script basic_barplots_combined.R to combine basic barplots for PS and lig 2020-09-18 18:09:24 +01:00
771995d1ab saving work 2020-09-18 18:07:48 +01:00
093ae0d832 added ggcorr all plot figure for supp 2020-09-18 12:46:12 +01:00
369c906a33 added ggcorr plots combined for all params 2020-09-18 11:56:19 +01:00
24b1cc2440 saving work 2020-09-18 11:55:08 +01:00
5e1c920a0c updated Header file and saving work 2020-09-17 20:12:08 +01:00
b8575c6e69 logo_combined.R, outputs logo plot with multiple mutations and log_or 2020-09-17 20:01:57 +01:00
40e4ddd70a minor tweaks in logo and corr plots 2020-09-17 20:00:34 +01:00
8ddca4a8b1 updated corr plots to show points with no colours 2020-09-17 17:17:11 +01:00
883207bc4b updated corr_PS_LIG.R to output both styles of corr plots 2020-09-17 17:04:03 +01:00
ea5d5bda44 renamed corr_plot scripts 2020-09-17 16:38:40 +01:00
f0ee1ff6c9 updated plot name in corr_plots_foldx.R 2020-09-17 16:36:45 +01:00
1b5280145b renamed file to denote corr adjusted and plain 2020-09-17 16:35:35 +01:00
fb0646373b added scratch_plots/ggpairs_test.R to play with ggally for future 2020-09-17 15:32:40 +01:00
5f335a5051 added plotting/corr_plots_style2.R; added my version of pairs.panel with the lower panel turned off. Also added a new script for corr plots using my version of pairs.panel 2020-09-17 15:31:37 +01:00
63e04ae600 saving work 2020-09-17 15:29:17 +01:00
375cdc2068 added new layout for dm_om and facet_lineage plot 2020-09-17 14:01:04 +01:00
a5b03e53e8 updated with two outputs: labelled and unlabelled 2020-09-16 15:37:56 +01:00
351e472e73 renaming and moving files 2020-09-16 14:57:51 +01:00
b36bfc9e9d renamed file in scratch plot/ 2020-09-16 14:53:53 +01:00
25f2f9e4a2 playing with lineage_dist_dm_om 2020-09-16 13:23:49 +01:00
ba02107e23 added dir scratch_plots/ to practice extra plots 2020-09-16 11:51:17 +01:00
0f6bf3875d updated plotting_data.R with stability colours as variables 2020-09-16 11:47:38 +01:00
83deb64e1c saving work 2020-09-15 13:34:26 +01:00
445f3e2047 updated distribution scripts to try adding points 2020-09-15 13:33:28 +01:00
44d1f64e88 updating lineage_country.R with different data slices 2020-09-15 13:14:33 +01:00
645827570f added ggridges_lineage_country.R for dist by country 2020-09-15 12:50:25 +01:00
ee69445f11 updated gitignore to include TO_DO/ 2020-09-14 17:26:28 +01:00
09e20cf7b3 added mutate.py and run_mutate.sh to create MSA alignments for mutant sequences, required to generate logoplot from sequence in R 2020-09-14 15:17:49 +01:00
3612ef0f2d saving logoplot attempts 2020-09-14 15:13:52 +01:00
a5fdf01d25 added corr_plots_foldx.R 2020-09-11 20:28:18 +01:00
e1da853cf1 updated figure for multi mut plot 2020-09-11 19:30:20 +01:00
968b57105f added logo_multiple_muts.R 2020-09-11 18:12:06 +01:00
431e606448 added check for active site mut count 2020-09-11 17:41:40 +01:00
fadd61bf57 saving extreme muts analysis 2020-09-11 16:43:27 +01:00
7e4be21575 added extreme_muts.R 2020-09-11 16:07:23 +01:00
8d9ede186c added delta symbol to plotting_data.R and pretty labels for dr_other_muts figure 2020-09-11 14:40:37 +01:00
ecbc7541e9 added plotting/other_plots_data.R 2020-09-11 12:52:17 +01:00
1262df40c9 results for electrostatic changes 2020-09-11 10:27:56 +01:00
078644c322 write merged_df3 files from combining_dfs_plotting 2020-09-11 09:51:53 +01:00
c124f49041 add scripts/mut_electrostatic_changes.py 2020-09-10 20:18:35 +01:00
26d0d7f42d updated notes with supp table colnames 2020-09-10 20:15:00 +01:00
c1041ad273 updated logo plot data to source from combining_df_plotting.R 2020-09-10 19:58:33 +01:00
e690f5beba added logo plot 2020-09-10 19:56:33 +01:00
c4225cec4f updated Header file with Logolas and ggseqlogo 2020-09-10 19:55:21 +01:00
d4e75d5f64 added merged_df3_short.csv for supp tables and struct figures 2020-09-10 19:17:05 +01:00
8be1418a32 saving work 2020-09-10 19:16:24 +01:00
6934faca10 saving other_plots.R 2020-09-10 17:53:49 +01:00
5102bbea1b Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-09-10 16:14:46 +01:00
f415b0b239 changes 2020-09-10 16:06:14 +01:00
cf732a3bcc saving work yet again to be extra sure 2020-09-10 16:03:04 +01:00
65841e4f5b saving recovered combining_dfs_plotting.R after editing 2020-09-10 15:52:22 +01:00
68050a93b4 move combining_dfs_plotting.R 2020-09-10 15:36:17 +01:00
fdecc944fc re-adding deleted combining_dfs_plotting.R 2020-09-10 15:28:10 +01:00
d43ecfa1dc updated gitignore and saving work 2020-09-10 14:45:10 +01:00
1708194912 added boxplots and stats for other numerical params 2020-09-10 14:09:40 +01:00
fc47c58f91 saving work after correlation plots 2020-09-09 20:56:07 +01:00
9bee97052e added correlation plots 2020-09-09 20:48:21 +01:00
f3f86d6651 renamed file 2020-09-09 19:11:06 +01:00
2c2c2c1a60 regenerated combined_or figure with correct muts 2020-09-09 19:03:52 +01:00
f85b1bd902 script to generate combined ps plot with af and or 2020-09-09 18:57:28 +01:00
e570454cf2 saving work 2020-09-09 18:56:59 +01:00
5025e47983 renamed lineage_dist 2020-09-09 17:34:32 +01:00
f424f4e2d6 corrected subcols_axis name in sucols_all_PS 2020-09-09 13:36:37 +01:00
080cd6375d lineage dist plots combined generated 2020-09-09 13:18:57 +01:00
19a984f228 generated lineage dist plots combined. needs tweaking 2020-09-09 12:53:53 +01:00
31b98fb3d3 plotting script with resolved gene metadata 2020-09-09 12:00:42 +01:00
774b34ef00 updated dir.R 2020-09-09 11:45:09 +01:00
09e4f7bfbd add dirs and resolving_ambiguous_muts 2020-09-09 11:36:40 +01:00
b7c7ffc018 resolved ambiguous muts and generated clean output. Also separated dir.R 2020-09-09 11:26:13 +01:00
46b43cf261 changing category of ambiguous muts 2020-09-08 18:51:03 +01:00
eb5491aad9 outputting revised all params file 2020-09-08 17:52:45 +01:00
42986bb119 hopefully finally sorted data merges! 2020-09-08 17:46:52 +01:00
fe49a45447 various changes 2020-09-08 17:13:02 +01:00
5d9561f88a trying other num param plots 2020-09-07 17:17:56 +01:00
648be02665 ks test script added 2020-09-07 15:27:53 +01:00
b4affa0c94 Combining dfs for PS and lig in one 2020-09-07 14:05:46 +01:00
2ef767f046 lineage barplot script 2020-09-07 11:29:28 +01:00
db87f98d32 updated gitignore 2020-09-04 22:46:07 +01:00
7460c7c97f updated combining_two_df.R for plots 2020-09-04 22:43:30 +01:00
dd1158a66c script to plot lineage dist plots 2020-09-04 22:40:49 +01:00
645868ea27 adding missing mutation col in combining_dfs 2020-09-04 21:04:18 +01:00
ddefcd7841 resolving missing mutation info in combining script 2020-09-04 20:56:16 +01:00
bba3487829 added running scripts doc 2020-08-26 17:20:01 +01:00
3f8d6695a4 all barplots generated for ps and lig 2020-08-26 17:18:45 +01:00
0220960975 reflected change in running_scripts doc 2020-08-26 16:41:10 +01:00
89e881b5d4 renamed file to reflect sucols_axis is commons script sourced by ps and lig plots 2020-08-26 16:40:36 +01:00
0e3f9e584b sorted subcols_axis script to generate correct axis cols for both PS and lig plots 2020-08-26 16:39:10 +01:00
482eeadb9a generated subcolour bps for PS 2020-08-26 12:45:09 +01:00
ed739aeb71 sourcing plotting_data for subcols_axis_PS 2020-08-26 12:07:04 +01:00
b754f26f9b added ligand df in plotting 2020-08-26 10:02:44 +01:00
73877942f4 added instructions on running plot scripts 2020-08-24 14:38:45 +01:00
75273cebbf generated replaced Bfactor pdbs 2020-08-24 14:37:28 +01:00
54f9fd073b rectified mcsm_mean_stability to average on raw values and then scale 2020-08-24 13:04:25 +01:00
d76345c3de saving work to check merge conflicts resolved 2020-08-24 11:20:58 +01:00
f468554427 sourced plotting script in mean_stability calcs 2020-08-21 17:33:09 +01:00
a448d9276b added plotting scripts from old run 2020-08-21 13:25:01 +01:00
d78626048c script to format snp_info.txt 2020-08-21 13:23:29 +01:00
acd0b8355b updated script to combine dfs 2020-08-21 13:22:28 +01:00
841d18d10b sorted df by position for output in data_extraction 2020-08-14 17:57:12 +01:00
48773a19ef tidy script for linking or_kinship with missense variant info 2020-08-14 16:41:11 +01:00
f8f33abad8 removed if clause for filenames 2020-08-13 18:39:16 +01:00
2d8cb01cb7 added output file for checking 2020-08-11 18:34:02 +01:00
dcd9a985ec saving work, ready for more remote working 2020-08-07 13:35:02 +01:00
13203e6fe0 added data checking script 2020-08-07 13:34:24 +01:00
61e41f1697 saving work 2020-08-07 13:33:44 +01:00
efe0178f4e separating data processing from plotting, started with basic_barplots_PS script 2020-07-16 18:59:17 +01:00
7d1ecbb660 replaced single quotes with double in R scripts 2020-07-16 14:18:18 +01:00
5e1b39cea0 mean stability values calcs and replaceBfactor plots 2020-07-16 14:12:08 +01:00
1f44f8ec0a calculating mean stability per position 2020-07-16 10:37:40 +01:00
1e785a08a1 scripts generating axis coloured subcols bp for PS 2020-07-15 16:31:10 +01:00
3cb33df009 made tweaks to output plot filenames 2020-07-15 16:29:36 +01:00
55f03bc343 adding plots as I tidy and generate 2020-07-15 13:50:07 +01:00
e41fb78e37 saved work before adding plots 2020-07-15 13:36:20 +01:00
e4270b67c8 saving work for today 2020-07-14 16:13:17 +01:00
2bc5be20b9 resolving merge conflicts due to shoddy data 2020-07-14 14:09:42 +01:00
7d36e0e36b fixed white space prob with mcsm input with merge 2020-07-14 14:07:23 +01:00
46b1505fdf remove white space in colnames before mcsm format output 2020-07-14 12:59:40 +01:00
83383b4493 finding discrepancy in merging or dfs, grrrr 2020-07-13 18:31:29 +01:00
9e8469abe3 trying to resolve copy warning in code 2020-07-13 12:20:43 +01:00
57a966c7c4 added sanity checks for or_kinship calcs 2020-07-13 11:37:43 +01:00
f9500d5324 added sanity checks for or_kin 2020-07-10 15:24:57 +01:00
5677175423 refactoring or_kin script minor changes only 2020-07-10 12:38:42 +01:00
c80faef0bf refactoring or_kin script minor changes only 2020-07-10 12:37:41 +01:00
aaf3f5e084 added cleaned up af_or_calcs.R 2020-07-09 15:55:16 +01:00
d3d82623d2 added consistent style scripts to format kd & rd values 2020-07-09 14:08:27 +01:00
e4a7deae7b minor tidy up in foldx, mcsm and dssp scripts 2020-07-09 14:04:16 +01:00
0379d3e241 renamed mcsm_wrapper to run_mcsm 2020-07-09 13:33:56 +01:00
91348aaae2 added dssp.py with refactored argeparse 2020-07-09 12:58:55 +01:00
f8e345f5bc adding default dirs and filenames to argparse in foldx and mcsm 2020-07-09 12:57:08 +01:00
6402990154 minor edits to format mcsm data like sorting df 2020-07-09 11:15:56 +01:00
01fbc2a87b ran foldx and mcsm (get) for 33k dataset 2020-07-08 20:30:32 +01:00
0e71b23759 modified extraction to be explicit for extracting nsSNP for specified gene 2020-07-08 18:47:22 +01:00
1fa0dc6ad4 minor changes in data extraction 2020-07-08 16:01:54 +01:00
c958cc1081 data extraction tidy up 2020-07-08 13:26:33 +01:00
a4670b9944 saving work for the day 2020-07-07 18:31:14 +01:00
a7f21cfb14 adding clean files for rerun of 35k dataset 2020-07-07 18:28:55 +01:00
943513a338 added script to combine all files in one 2020-07-07 16:06:11 +01:00
5addb85851 renamed files that combine dfs 2020-07-07 15:46:13 +01:00
a220288c5f testing combining df script 2020-07-03 19:23:23 +01:00
262bd79204 still fiddling with combining dfs 2020-07-03 19:22:46 +01:00
90cbb49560 added fixme: for some necessary required changes 2020-07-02 14:16:40 +01:00
f758c01159 added combining funct & combining_mcsm_foldx script 2020-07-01 16:41:58 +01:00
4d686e2933 refactor foldx pipeline to include:
* command-line args
* creating necessary dirs automagically
* code cleanup, syntax errors, etc etc
2020-06-30 17:14:30 +01:00
af65a86ff9 updated code and made it tidy 2020-06-25 14:40:44 +01:00
3c6122a296 tidying script 2020-06-25 13:12:09 +01:00
b82cc11dbe updated ref dict to create separate dicts 2020-06-24 14:10:39 +01:00
626ed3a57b added commonly used mutation format for missense muts in the gene_specific nssnp_info file 2020-06-24 13:34:35 +01:00
a298071309 combined and output all ors 2020-06-23 17:34:54 +01:00
003b22ce3f script for calculating various OR & output csv 2020-06-23 13:07:29 +01:00
a1cc7ee33d further tidy for OR calcs 2020-06-23 12:19:26 +01:00
1e43ca8136 tidy scratch script for various OR calcs 2020-06-23 11:57:51 +01:00
18998092f4 all OR calcs using sapply and output as df 2020-06-22 18:17:06 +01:00
8f272bdc17 extracting other params from logistic 2020-06-22 14:11:16 +01:00
ada205962b script to combine ors and afs 2020-06-22 13:07:26 +01:00
0c3c6fd143 script to combine all ors 2020-06-19 14:43:23 +01:00
3497d1ef54 renamed files & added or kinship link file 2020-06-19 10:33:26 +01:00
fa2bcb5f05 updated AF and OR calcs script with argparse and minor tidy-up 2020-06-18 18:37:55 +01:00
76ecb65a1a getopt and commandArgs examples, and AF/OR update to use getopt() 2020-06-18 17:59:28 +01:00
6c2c7e0a90 removed merging df for AF_OR 2020-06-18 16:10:02 +01:00
b33419c939 af and or calcs, not merging 2020-06-18 15:57:25 +01:00
010ef133dd formatting and adding or 2020-06-18 13:55:45 +01:00
fdba990b80 added AF_and OR calcs script and making it generic 2020-06-17 19:36:34 +01:00
8d1daabff4 ran struc param analysis 2020-06-17 19:36:02 +01:00
e21635fe02 included the revised master file for 35k isolates 2020-06-16 11:39:11 +01:00
e2f319ba42 various debug, doc, and args 2020-05-25 14:27:25 +01:00
f6fc6e47ab added scratch/ 2020-05-22 12:03:11 +01:00
3fe1d35df5 building script for inspecting pdb 2020-05-22 11:57:59 +01:00
ca36e004c1 fixing hetatm script 2020-05-21 12:54:10 +01:00
15dea0cbf6 added script for pairwise alignment 2020-05-15 17:58:14 +01:00
548d9a5192 tidy up code 2020-05-15 13:48:50 +01:00
f7e371a585 script for saving pdb chains in single file 2020-05-15 13:44:57 +01:00
01a7cbf26e renamed extract chain file 2020-05-15 10:59:19 +01:00
65db4a090e added pdb_chain splitter code and wrapper 2020-05-13 16:54:20 +01:00
3425d8fa2b added pdbtools from github source and modified seq.py to exclude hetatm seq extraction 2020-05-12 14:08:08 +01:00
7f66d5d19e adding commands for use of pdbtools 2020-05-12 12:50:49 +01:00
b28d866237 handle not ready (refresh) url 2020-04-21 17:12:18 +01:00
a405aa17c3 moved scripts to /ind_scripts & added add col to formatting script 2020-04-20 12:52:10 +01:00
e94da61871 fixed indentation error and ran mcsm_wrapper dcs 2020-04-17 12:19:08 +01:00
e50466da39 add wrapper and mcsm library 2020-04-16 17:45:24 +01:00
7aafa72e10 defined method for formatting mcsm_results 2020-04-14 11:30:36 +01:00
45889990e7 saving work for the day 2020-04-11 19:00:39 +01:00
7d2241ad81 added lambda func to normalise duet and aff values 2020-04-11 18:52:57 +01:00
398eccd246 added script to format results 2020-04-10 19:32:47 +01:00
f5241048b4 saving work for today 2020-04-09 16:40:45 +01:00
0550cfe0e2 adding separate script for getting results for mcsm 2020-04-09 15:42:56 +01:00
7cee9b21e2 refactoring bash into python to run mcsm 2020-04-08 18:27:51 +01:00
7a8bbc6595 minor tweaks 2020-04-08 18:27:09 +01:00
fe3d431a3d combine df script with command line args and added method 2020-04-08 12:44:17 +01:00
c025a22343 correcting indentation 2020-04-08 12:43:37 +01:00
30aa64fd2b refactoring: added command line args to combine_dfs 2020-04-08 11:44:53 +01:00
49a38dd1ae saving work for today 2020-04-07 17:57:34 +01:00
569b7c6c7f adapted rd_df script to make it take command line args and define function 2020-04-07 17:42:59 +01:00
811027e34d tidy kd_df script 2020-04-07 17:42:06 +01:00
02488ea23e adapted kd calc script with command line args and made it into a function 2020-04-07 16:45:59 +01:00
6afe202931 kd script with command line args and as function 2020-04-07 16:39:50 +01:00
44577b4a0c updating kd script to take command line args 2020-04-07 16:13:54 +01:00
24c7ade7c4 renamed file for consistency 2020-04-07 16:04:01 +01:00
f690c75ca0 modified dssp_df to handle multiple chains 2020-04-07 16:02:19 +01:00
d161fcd0f3 added dssp.py that runs, processes and outputs csv 2020-04-07 15:08:18 +01:00
b0e56328ef adding settings params 2020-04-06 19:04:35 +01:00
cc9cdbcad5 refactoring code to make it take command line args 2020-04-06 19:03:41 +01:00
b5aa524914 logoplot from df and seqs with custom height 2020-03-29 17:11:17 +01:00
34a2057d29 added R header file to base dir to allow general access by R scripts 2020-03-28 17:56:39 +00:00
b1e4dcd376 tidied combining plot scripts 2020-03-28 17:54:45 +00:00
e7f2a3aada added mutate.py script for msa generation 2020-03-27 17:11:16 +00:00
ab541aa3de saving work for the day 2020-03-27 17:08:33 +00:00
d1da203df0 changed filename to the new combined output (mcsm+struct params) 2020-03-27 12:43:48 +00:00
82e96fcdba combining mcsm and struct params 2020-03-27 12:39:02 +00:00
afd6ca8881 tidy code and saving work for the day 2020-03-26 17:58:39 +00:00
69e2567ffc added script to combined dfs of structural params like kd, dssp & rd 2020-03-26 17:14:20 +00:00
c0bac6fd7b changed outcols in dssp and kd outputs 2020-03-26 17:12:59 +00:00
5bab99c15f added residue depth processing to generate df 2020-03-26 15:44:20 +00:00
0b7a938fbd tidy code and renamed kd.py to kd_df.py 2020-03-26 15:43:13 +00:00
4c2fa2b600 tidied and updated kd and dssp scripts & generated their respective outputs 2020-03-25 18:19:23 +00:00
87a847109a updated kd.py to reflect a merging col for combining num params later 2020-03-25 15:20:54 +00:00
de1822f491 output from comb script & electrostatic mut changes calculated 2020-03-25 13:42:18 +00:00
96ebb85069 updated combining df scripts for duet & lig 2020-03-24 18:28:52 +00:00
c184841951 minor changes to variable names in .R & .py 2020-03-24 10:36:51 +00:00
dd91692673 renamed files to make more generic 2020-03-23 18:13:02 +00:00
22a0d38563 renamed files to make more generic 2020-03-23 17:48:39 +00:00
d42e6fbdb3 fixed bugs and tidy code 2020-03-23 17:43:06 +00:00
b4dbad7e54 delete old file 2020-03-23 17:40:19 +00:00
b331227023 updated pnca_extraction and AF_OR calcs 2020-03-23 17:36:42 +00:00
eb021349fe bug fixes and massive clean up of data extraction script 2020-03-23 13:33:25 +00:00
8df0b7d920 saving from work 2020-02-27 15:16:20 +00:00
77cc5bf42c renamed file and updated logo plot code 2020-02-26 12:00:32 +00:00
95e8205189 added 2 logo plot scripts 2020-02-25 19:09:43 +00:00
f9837b474c updating mut_seq script 2020-02-25 18:13:18 +00:00
e9a95e9d3a hydrophobicity script 2020-02-25 10:42:58 +00:00
ed8fc4d488 remove old surface_res3.py 2020-02-20 12:23:56 +00:00
d7ef8ef51e fixup 2020-02-20 10:41:49 +00:00
b56c0b8b68 adding scripts for struct params 2020-02-16 15:14:36 +00:00
4ef68bdc1b remove __pycache__, update .gitignore 2020-02-16 15:08:45 +00:00
b97712edb0 test commit 2020-02-16 15:00:49 +00:00
9e4b3c5dce added script to calculate electrostatic changes of mutations 2020-02-11 15:03:21 +00:00
0653a8c1e3 updated ref dict to inc aa_calcprop 2020-02-11 15:02:32 +00:00
d12ef0ef00 saving a and b labels in bubble plot with brackets 2020-02-02 11:39:35 +00:00
d9519b6262 added script for KS_test for DUET 2020-02-02 11:36:17 +00:00
134dea609d tidy code for lineage_dist_PS 2020-02-02 11:14:25 +00:00
8c7c389562 tidying script for lineage dist PS and separating KS test results 2020-02-02 11:11:49 +00:00
632b78320a added bubble plot 2020-02-02 09:17:11 +00:00
c15d1a8a95 added script for coloured axis for ligand affinity 2020-01-31 16:39:22 +00:00
3390f80168 remove .Rhistory 2020-01-31 15:35:25 +00:00
1d80186ab9 Merge branch 'master' of https://git.tunstall.in/tanu/LSHTM_analysis 2020-01-31 15:34:58 +00:00
15daa6dfc1 remove .Rhistory 2020-01-31 15:32:32 +00:00
ac34de9e79 added subaxis plots for PS and lig separately 2020-01-31 15:30:08 +00:00
f1584bddb1 saving previous stuff from work 2020-01-30 08:26:21 +00:00
6cbef0c3d7 tidy script for data extraction 2020-01-28 11:53:10 +00:00
1edfe3f8f8 Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis 2020-01-28 10:17:24 +00:00
tgttunstall 8d2456f7f2 Update README.md 2020-01-28 10:14:08 +00:00
15391a5700 saving data_extraction from home 2020-01-28 10:13:01 +00:00
c3c50f65f2 saving previous work from home pc 2020-01-28 10:13:01 +00:00
4d2d03f634 added coloured axis barplots 2020-01-28 10:13:01 +00:00
bcf822d6e4 updated lineage dist for LIG for consistency 2020-01-28 10:13:01 +00:00
4f06e42ee4 graphs for PS lineage dist for all and dr muts 2020-01-28 10:13:01 +00:00
4bcb81e9be saving data_extraction from home 2020-01-28 10:10:16 +00:00
be213cb7e9 saving previous work from home pc 2020-01-23 09:31:35 +00:00
cae9c550a4 added coloured axis barplots 2020-01-22 15:09:21 +00:00
2df031c02a updated lineage dist for LIG for consistency 2020-01-22 11:34:59 +00:00
c1ea688c5c graphs for PS lineage dist for all and dr muts 2020-01-22 10:12:09 +00:00
tgttunstall ec37e3c1f6 Update README.md 2020-01-14 11:29:13 +00:00
tgttunstall 50ade050c2 Update README.md 2020-01-14 11:22:41 +00:00
200 changed files with 506005 additions and 7016 deletions

.gitignore (vendored, 17 lines changed)

@@ -1,6 +1,23 @@
*.xls
*.xlsx
*.ods
*.tar.gz
.Rhistory
*.pyc
__pycache__
*/__pycache__
manual_*
*temp*
mcsm_analysis_fixme
meta_data_analysis
del
example*
scratch
historic
test
plotting_test
*old*
foldx/test/
TO_DO
.RData
scratch_plots

README.md

@@ -1,35 +1,39 @@
mCSM Analysis
mCSM
=============
This repo does mCSM analysis using Python, bash and R.
Requires an additional 'Data' directory. Batteries not included.
This contains scripts that do the following:
1. mcsm.py: function for submitting mcsm job and extracting results
2. run_mcsm.py: wrapper to call mcsm.py
foldx
=============
This contains scripts that do the following:
1. runFoldx.py: submitting foldx requests and extracting results
2. runfoldx.sh: is wrapped by runFoldx.py
Requires an additional 'Data' directory. Batteries not included:-)
## Assumptions
1. git repos are cloned to `~/git`
2. Requires a `Data/` in `~/git` which has the structure created by `mk_drug_dirs.sh`
2. Requires a data directory with `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`
## LSHTM\_analysis:
subdirs within this repo
```
meta\_data\_analysis/
scripts
*.R
*.py
mcsm\_analysis/
<drug>/
scripts/
*.R
*.py
mcsm/
*.sh
*.py
*.R
plotting/
*.R
plotting/
*.R
mcsm
*.py
foldx
*.py
*.sh
```
More docs here as I write them.
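The `Data/<drug>/input` and `Data/<drug>/output` layout assumed above is created by the repo's own `mk_drug_dirs.sh`; purely as an illustration of the expected structure, a minimal Python sketch (the helper name and default path here are assumptions, not the repo's API):

```
from pathlib import Path

# Illustrative only: mirrors the Data/<drug>/input and Data/<drug>/output
# layout that mk_drug_dirs.sh is described as creating.
def make_drug_dirs(drug, datadir=Path.home() / 'git' / 'Data'):
    for sub in ('input', 'output'):
        (datadir / drug / sub).mkdir(parents=True, exist_ok=True)

make_drug_dirs('streptomycin')
```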

dynamut/format_results_dynamut.py (new executable file, 162 lines)

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut results for all muts,
    i.e. the result of combining all dynamut_output batch results into one
    file using bash scripts. This is run post run_get_results_dynamut.py.
    Formats the data into a pandas df for writing out as csv.
    @type string

    @return formatted df of dynamut output
    @type pandas df
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)
    #%%============================================================================
    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)
        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)
        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # Rename categ for: dds_encom
    len(dynamut_data[dynamut_data['dds_encom'] >= 0])
    dynamut_data['dds_encom_outcome'] = dynamut_data['dds_encom'].apply(lambda x: 'Increased_flexibility' if x >= 0 else 'Decreased_flexibility')
    dynamut_data['dds_encom_outcome'].value_counts()
    #%%=====================================================================
    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet', 'dds_encom']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print('\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x: x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print('\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)
    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'ddg_dynamut'
                                   , 'ddg_dynamut_scaled'
                                   , 'ddg_dynamut_outcome'
                                   , 'ddg_encom'
                                   , 'ddg_encom_scaled'
                                   , 'ddg_encom_outcome'
                                   , 'ddg_mcsm'
                                   , 'ddg_mcsm_scaled'
                                   , 'ddg_mcsm_outcome'
                                   , 'ddg_sdm'
                                   , 'ddg_sdm_scaled'
                                   , 'ddg_sdm_outcome'
                                   , 'ddg_duet'
                                   , 'ddg_duet_scaled'
                                   , 'ddg_duet_outcome'
                                   , 'dds_encom'
                                   , 'dds_encom_scaled'
                                   , 'dds_encom_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
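The scaling step in format_dynamut_output() divides negative values by |min| and non-negative values by max, so the sign is preserved and every value lands in [-1, 1]. A minimal sketch with toy numbers (assuming the column contains at least one negative and one positive value):

```
import pandas as pd

# Toy ddG values; real columns come from the combined dynamut CSV.
df = pd.DataFrame({'ddg_dynamut': [-2.0, -0.5, 0.0, 1.5, 3.0]})
col_min = df['ddg_dynamut'].min()   # -2.0
col_max = df['ddg_dynamut'].max()   #  3.0

# Same lambda as above: negatives scaled by |min|, non-negatives by max.
col_scale = lambda x: x/abs(col_min) if x < 0 else x/col_max
df['ddg_dynamut_scaled'] = df['ddg_dynamut'].apply(col_scale)
print(df['ddg_dynamut_scaled'].tolist())   # [-1.0, -0.25, 0.0, 0.5, 1.0]
```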

dynamut/format_results_dynamut2.py (new file, 137 lines)

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut2 results for all muts,
    i.e. the result of combining all dynamut2_output batch results into one
    file using bash scripts. Dynamut2 was run manually in batches.
    Formats the data into a pandas df for writing out as csv.
    @type string

    @return formatted df of dynamut2 output
    @type pandas df
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)
    #%%============================================================================
    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut2']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)
        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)
        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)
    #%%=====================================================================
    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in the ddg_dynamut2 col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    outcome_cols = ['ddg_dynamut2']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print('\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x: x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')
        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print('\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)
    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'chain'
                                   , 'ddg_dynamut2'
                                   , 'ddg_dynamut2_scaled'
                                   , 'ddg_dynamut2_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
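The outcome classification above is a sign threshold plus a count-based sanity check. A toy version of the same pattern (illustrative data only):

```
import pandas as pd

df = pd.DataFrame({'ddg_dynamut2': [-1.2, 0.0, 0.8]})

# >= 0 is 'Stabilising', < 0 is 'Destabilising', as in the function above.
df['ddg_dynamut2_outcome'] = df['ddg_dynamut2'].apply(
    lambda x: 'Stabilising' if x >= 0 else 'Destabilising')

# PASS/FAIL check: count of non-negative values must match the count of
# rows labelled 'Stabilising'.
c1 = len(df[df['ddg_dynamut2'] >= 0])
c2 = len(df[df['ddg_dynamut2_outcome'] == 'Stabilising'])
assert c1 == c2 == 2
```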

dynamut/get_results_dynamut.py (new executable file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
    # initialise empty df
    dynamut_results_out_df = pd.DataFrame()
    with open(url_file, 'r') as f:
        for count, line in enumerate(f):
            line = line.strip()
            print('URL no.', count+1, '\n', line)
            #batch_response = requests.get(line, headers=headers)
            batch_response = requests.get(line)
            batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')
            # initialise empty df
            #dynamut_results_df = pd.DataFrame()
            for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
                print("Found the URL:", a['href'])
                single_result_url = host_url + a['href']
                snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url).group(0)
                print(snp)
                print('\nGetting results from:', single_result_url)
                result_response = requests.get(single_result_url)
                if result_response.status_code == 200:
                    print('\nFetching results for SNP:', snp)
                    # extract results using the html parser
                    soup = BeautifulSoup(result_response.text, features = 'html.parser')
                    #web_result_raw = soup.find(id = 'predictions').get_text()
                    ddg_dynamut = soup.find(id = 'ddg_dynamut').get_text()
                    ddg_encom = soup.find(id = 'ddg_encom').get_text()
                    ddg_mcsm = soup.find(id = 'ddg_mcsm').get_text()
                    ddg_sdm = soup.find(id = 'ddg_sdm').get_text()
                    ddg_duet = soup.find(id = 'ddg_duet').get_text()
                    dds_encom = soup.find(id = 'dds_encom').get_text()
                    param_dict = {"mutationinformation" : snp
                                  , "ddg_dynamut" : ddg_dynamut
                                  , "ddg_encom" : ddg_encom
                                  , "ddg_mcsm" : ddg_mcsm
                                  , "ddg_sdm" : ddg_sdm
                                  , "ddg_duet" : ddg_duet
                                  , "dds_encom" : dds_encom
                                  }
                    results_df = pd.DataFrame.from_dict(param_dict, orient = "index").T
                    print('Result DF:', results_df, 'for URL:', line)
                    #dynamut_results_df = dynamut_results_df.append(results_df)#!1 too many!:-)
                    dynamut_results_out_df = dynamut_results_out_df.append(results_df)
                    #print(dynamut_results_out_df)
    #============================
    # Writing results file: csv
    #============================
    dynamut_results_dir = output_dir + 'dynamut_results/'
    if not os.path.exists(dynamut_results_dir):
        print('\nCreating dir: dynamut_results within:', output_dir)
        os.makedirs(dynamut_results_dir)
    print('\nWriting dynamut results df')
    print('\nResults File:'
          , '\nNo. of rows:', dynamut_results_out_df.shape[0]
          , '\nNo. of cols:', dynamut_results_out_df.shape[1])
    print(dynamut_results_out_df)
    #dynamut_results_out_df.to_csv('/tmp/test_dynamut.csv', index = False)
    # build out filename
    out_filename = dynamut_results_dir + 'dynamut_output_' + outfile_suffix + '.csv'
    dynamut_results_out_df.to_csv(out_filename, index = False)

    # TODO: add as a cmd option
    # Download .tar.gz file
    prediction_number = re.search(r'([0-9]+$)', line).group(0)
    tgz_url = f"{host_url}/dynamut/results_file/results_" + prediction_number + '.tar.gz'
    tgz_filename = dynamut_results_dir + outfile_suffix + '_results_' + prediction_number + '.tar.gz'
    response_tgz = requests.get(tgz_url, stream = True)
    if response_tgz.status_code == 200:
        print('\nDownloading tar.gz file:', tgz_url
              , '\n\nSaving file as:', tgz_filename)
        with open(tgz_filename, 'wb') as f:
            f.write(response_tgz.raw.read())
#%%#####################################################################
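get_results() pulls each prediction out of a Dynamut result page by element id via BeautifulSoup. A self-contained toy of that id-based extraction on static HTML (the markup here is invented for illustration; the real pages are fetched with requests):

```
from bs4 import BeautifulSoup

# Invented stand-in for a fetched Dynamut result page.
html = '''
<div>
  <span id="ddg_dynamut">-0.562 kcal/mol</span>
  <span id="dds_encom">0.013 kcal.mol-1.K-1</span>
</div>
'''
soup = BeautifulSoup(html, features = 'html.parser')
print(soup.find(id = 'ddg_dynamut').get_text().strip())  # -0.562 kcal/mol
print(soup.find(id = 'dds_encom').get_text().strip())    # 0.013 kcal.mol-1.K-1
```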

dynamut/run_format_results_dynamut.py (new file, 101 lines)

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME
# RE RUN when B07 completes!!!! as norm gets affected!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import *
from format_results_dynamut2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_dynamut2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
outdir_dynamut = outdir + 'dynamut_results/'
outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'
# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
# Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
#%%========================================================================
#===============================
# CALL: format_results_dynamut
# DYNAMUT results
#===============================
# print('Formatting results for:', infile_dynamut)
# dynamut_df_f = format_dynamut_output(infile_dynamut)
# # writing file
# print('Writing formatted dynamut df to csv')
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
# print('Finished writing file:'
# , '\nFile:', outfile_dynamut_f
# , '\nExpected no. of rows:', len(dynamut_df_f)
# , '\nExpected no. of cols:', len(dynamut_df_f.columns)
# , '\n=============================================================')
#===============================
# CALL: format_results_dynamut2
# DYNAMUT2 results
#===============================
print('Formatting results for:', infile_dynamut2)
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
# writing file
print('Writing formatted dynamut2 df to csv')
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_dynamut2_f
, '\nExpected no. of rows:', len(dynamut2_df_f)
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################
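# NOTE: the *_norm.csv output names above suggest the formatting step rescales
# the prediction column(s), which is why the FIXME warns that a missing batch
# (B07) affects the normalisation. A generic min-max rescale sketch in pandas,
# with a hypothetical column name 'ddg' (not necessarily the repo's column):
# def minmax_scale(df, col = 'ddg'):
#     rng = df[col].max() - df[col].min()
#     df[col + '_scaled'] = (df[col] - df[col].min()) / rng if rng else 0.0
#     return df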


@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from get_results_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# Needed if things try to block the 'requests' user agent
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# TODO: add cmd line args; until then, hard-code the target for this batch
# (gid_b7 belongs to gid + streptomycin, per the matching submit script)
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
#==============================================================================
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
# output file is saved in dynamut_results/ (created inside outdir if it doesn't exist)
print('Fetching results from url file :', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
########################################################################

dynamut/run_submit_dynamut.py Executable file

@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from submit_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/dynamut/prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut = outdir + 'dynamut_results/'
my_chain = 'A'
my_email = 'tanushree.tunstall@lshtm.ac.uk'
#my_pdb_file = indir + 'gid_complex.pdb'
my_pdb_file = indir + gene + '_complex.pdb'
#==============================================================================
# Rerunning batch 7 (07.txt) # RAN: 12 Aug 15:22; the previous run produced a 0-byte output file!
my_mutation_list = outdir + 'snp_batches/50/snp_batch_07.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
# CALL: submit_dynamut()
# Data: gid+streptomycin
#==========================
print('\nSubmitting batch for:'
, '\nFilename : ' , my_mutation_list
, '\nbatch : ' , my_suffix
, '\ndrug : ' , drug
, '\ngene : ' , gene
, '\npdb file : ' , my_pdb_file)
submit_dynamut(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, chain = my_chain
, email_address = my_email
, prediction_url = my_prediction_url
, output_dir = outdir_dynamut
, outfile_suffix = my_suffix)
#%%#####################################################################
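# NOTE: this script submits one batch at a time by hand. A sketch (not part of
# the original) of looping the same call over all batch files produced by
# split_csv.sh, deriving the suffix from the filename:
# import glob, re
# for batch_file in sorted(glob.glob(outdir + 'snp_batches/50/snp_batch_*')):
#     batch_no = re.findall(r'\d+', os.path.basename(batch_file))[-1]
#     submit_dynamut(host_url = my_host
#                    , pdb_file = my_pdb_file
#                    , mutation_list = batch_file
#                    , chain = my_chain
#                    , email_address = my_email
#                    , prediction_url = my_prediction_url
#                    , output_dir = outdir_dynamut
#                    , outfile_suffix = gene + '_b' + batch_no)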

dynamut/split_csv.sh Executable file

@ -0,0 +1,19 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# First, copy the snp file you want to split into the dynamut dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split -l ${CHUNK} -d ../../${INFILE} snp_batch_
# use case
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
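# NOTE: a rough Python equivalent of the split call above, assuming a plain
# one-mutation-per-line input file (sketch only, not used by the pipeline):
# from itertools import islice
# def split_file(infile, outdir, chunk = 50):
#     with open(infile) as f:
#         batch = 0
#         while True:
#             lines = list(islice(f, chunk))
#             if not lines:
#                 break
#             with open(f'{outdir}/snp_batch_{batch:02d}', 'w') as out:
#                 out.writelines(lines)
#             batch += 1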

dynamut/submit_dynamut.py Executable file

@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_dynamut(host_url
, pdb_file
, mutation_list
, chain
, email_address
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for dynamut predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
@param mutation_list: list of mutations (1 per line) of the format: {WT}<POS>{Mut}
@type string
@param chain: chain ID, a single uppercase letter
@type string
@param email_address: email address for results notification
@type string
@param prediction_url: dynamut url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: to append to outfile
@type string
@return writes a .txt file containing the result url for the processed snps, with the user-provided suffix in the filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"chain": chain
, "email": email_address}
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/dynamut/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
#===============
# writing file: result urls
#===============
dynamut_temp_dir = output_dir + 'dynamut_temp/' # creates a temp dir within output_dir
if not os.path.exists(dynamut_temp_dir):
print('\nCreating dynamut_temp in output_dir', output_dir )
os.makedirs(dynamut_temp_dir)
out_url_file = dynamut_temp_dir + 'dynamut_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file
, '\nNow we wait patiently...')
with open(out_url_file, 'a') as myfile:
    myfile.write(url)
#%%#####################################################################
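# NOTE: BeautifulSoup is imported above but never used; the result url is
# extracted with a raw regex instead. A sketch of the same lookup via the
# parsed HTML (assumes the link appears as an <a href> in the response page):
# soup = BeautifulSoup(response.text, 'html.parser')
# link = soup.find('a', href = re.compile(r'/dynamut/results_prediction/'))
# if link:
#     url = host_url + link['href']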

foldx/cmd_change Normal file

@ -0,0 +1,3 @@
# fix hard-coded Mac paths in the shell scripts ('|' used as the sed delimiter for readability)
sed -i 's|/Users/Charlotte/Downloads/foldxMacC11/|/home/tanu/git/LSHTM_analysis/foldx/|g' *.sh
rm *.txt *.fxout *Repai*pdb


@ -0,0 +1,68 @@
PDB=$1
n=$2
#cd /home/tanu/git/LSHTM_analysis/foldx/
logger "Running mutrenamefiles_mac"
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,4d Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,68 @@
PDB=$1
logger "Running renamefiles_mac"
#cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '.bak' -e 1,8d Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,4d Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_PN.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}


@ -0,0 +1,336 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
# https://realpython.com/python-pathlib/
# FIXME:
# strong dependency on file and path names
# cannot pass a file together with its path; they must be passed separately
# assumes the standard dir structure: datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug +'/' + 'processing'
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_name = Path(pdb_filename).stem
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = mut_filename
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
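# e.g. with chainA = 'A', an mcsm-style mutation 'L4S' is written to
# individual_list_<pdbname>.txt as 'LA4S;' (FoldX individual-list format)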
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
    # load a tab-separated text file into an np matrix
    resultList = []
    with open(df, 'r') as f:
        for line in f:
            aVals = line.rstrip('\n').split('\t')
            fVals = list(map(np.float32, aVals))
            resultList.append(fVals)
    return np.asarray(resultList, dtype=np.float32)
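# NOTE: loadFiles is roughly equivalent to the single documented numpy call
# np.loadtxt(df, delimiter = '\t', dtype = np.float32); it is kept as-is
# since nothing in this script appears to call it.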
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
print('end')
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
for n in range(1,nmuts+1):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
for n in range(1,nmuts+1):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
if comp=='y':
chain1=chainA
chain2=chainB
with suppress(Exception):
subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
for n in range(1,nmuts+1):
with suppress(Exception):
subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
# ddG is already row 0 of 'data'; no further append needed (DataFrame.append returns a new frame and is removed in pandas 2.x)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a columns
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts (i.e. drop the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
if __name__ == '__main__':
main()


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}

foldx/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
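# NOTE: every cp + sed block above slices the same four sections out of a
# matrix .fxout file. A Python sketch of the "sed -n '5,190p'"-style
# extraction as a loop (names hypothetical, not part of the pipeline):
# SECTIONS = {'RR': (5, 190), 'MM': (194, 379), 'SM': (383, 568), 'SS': (572, 757)}
# def slice_matrix(fxout, txt_prefix):
#     with open(fxout) as f:
#         lines = f.readlines()
#     for tag, (start, end) in SECTIONS.items():
#         with open(f'{txt_prefix}_{tag}.txt', 'w') as out:
#             out.writelines(lines[start - 1:end])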

foldx/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt

foldx/rotabase.txt Normal file

File diff suppressed because it is too large

foldx/runFoldx.py Executable file

@ -0,0 +1,466 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
# https://realpython.com/python-pathlib/
# FIXME:
# strong dependency on file and path names
# cannot pass a file together with its path; they must be passed separately
# assumes the standard dir structure: datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where 'drug' is not given:
# indir, outdir and process_dir must then all be specified explicitly
if not drug:
    if not indir or not outdir or not process_dir:
        print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
        sys.exit(1)
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
    # load a tab-separated text file into an np matrix
    resultList = []
    with open(df, 'r') as f:
        for line in f:
            aVals = line.rstrip('\n').split('\t')
            fVals = list(map(np.float32, aVals))
            resultList.append(fVals)
    return np.asarray(resultList, dtype=np.float32)
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# some common parameters for foldX, kept as a list so that subprocess
# passes each flag as its own argument (a single space-joined string
# would reach foldx as one unparseable argv entry)
foldx_common = ['--ionStrength=0.05', '--pH=7', '--water=PREDICT', '--vdwDesign=1']
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx'
                 , '--command=RepairPDB']
                + foldx_common
                + ['--pdb-dir=' + os.path.dirname(pdb_filename)
                 , '--pdb=' + actual_pdb_filename
                 , 'outPDB=true'
                 , '--output-dir=' + process_dir])
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx'
                 , '--command=BuildModel']
                + foldx_common
                + ['--pdb-dir=' + process_dir
                 , '--pdb=' + pdbname + '_Repair.pdb'
                 , '--mutant-file=individual_list_' + pdbname + '.txt' # no embedded quotes: subprocess passes each arg verbatim
                 , 'outPDB=true'
                 , '--numberOfRuns=1'
                 , '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
# ddG is already row 0 of 'data'; no further append needed (DataFrame.append returns a new frame and is removed in pandas 2.x)
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()

foldx/runFoldx5.py Executable file

@ -0,0 +1,466 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
import time
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
# Handle the case where neither 'drug'
# nor (indir,outdir,process_dir) are defined
if not drug:
if not indir or not outdir or not process_dir:
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
sys.exit()
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
if not process_dir:
process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_filename = os.path.abspath(pdb_filename)
pdb_name = Path(pdb_filename).stem
infile_pdb = pdb_filename
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
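# FoldX's BuildModel reads mutations from an 'individual list' file: one
# mutation per line as <wt aa><chain><position><mutant aa> ending in ';',
# e.g. 'SA2C;'. formatMuts() below inserts the chain ID into mcsm-style
# mutations (e.g. 'S2C') to produce that format and writes the list to
# process_dir.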
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
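# getIndexes() returns (row, col) pairs for every cell of 'data' equal to
# 'value'; getInteractions() above uses it with value = 1 to count the
# contacts present in a FoldX network matrix.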
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
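# NOTE: loadFiles() appears to be unused in this script.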
# TODO: put the subprocess call in a 'def'
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
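# NOTE: comp stays '' unless edited, so the AnalyseComplex stages guarded
# by "if comp=='y'" below are currently skipped.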
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
# some common parameters for foldX
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
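# NOTE: foldx_common is passed to subprocess.call() as a single list item;
# the list form does no word-splitting, so foldx receives these four flags
# as one argument. If foldx rejects or ignores them, split foldx_common
# into separate list elements.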
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx5'
, '--command=RepairPDB'
, foldx_common
, '--pdb-dir=' + os.path.dirname(infile_pdb) # works for both user-supplied and default pdb paths
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx5'
, '--command=BuildModel'
, foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=individual_list_' + pdbname + '.txt' # quotes removed: subprocess passes list items verbatim, so embedded quotes would become part of the filename
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx5'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx5'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
#FIXME: bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
print('\n==========================================================')
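# A possible pythonic replacement for the rename shell scripts (a sketch,
# untested; the real scripts also slice section-specific line ranges out
# of the Matrix_* files rather than copying them whole):
#   from pathlib import Path
#   for fx in Path(process_dir).glob('*' + pdbname + '_Repair*.fxout'):
#       fx.with_suffix('.txt').write_text(fx.read_text())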
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx5'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
print('\n==========================================================')
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')
if __name__ == '__main__':
main()


@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt


@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3
cd ${OUTDIR}
logger "Running repairPDB"
#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}


@ -0,0 +1,7 @@
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}


@ -0,0 +1,9 @@
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt


@ -0,0 +1,9 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls -l
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}

View file

@ -0,0 +1,2 @@
S2C
S2F

foldx/test2/mutrenamefiles.sh Executable file

@ -0,0 +1,63 @@
PDB=$1
n=$2
OUTDIR=$3
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt

foldx/test2/renamefiles.sh Executable file

@ -0,0 +1,64 @@
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
#cd /home/git/LSHTM_analysis/foldx/test2
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '1,8d' Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt

foldx/test2/rotabase.txt Normal file

File diff suppressed because it is too large

foldx/test2/runFoldx.py Symbolic link

@ -0,0 +1 @@
../runFoldx.py

foldx/test2/runFoldx_test.py Executable file

@ -0,0 +1,250 @@
#!/usr/bin/env python3
import subprocess
import os
import numpy as np
import pandas as pd
from contextlib import suppress
import re
import csv
def getInteractions(filename):
data = pd.read_csv(filename, index_col=0, header =0, sep="\t")
contactList = getIndexes(data,1)
print(contactList)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/individual_list_"+pdbname+".txt"
with open(outfile, "w") as output:
for m in muts:
print(m)
mut = m[:1]+'A'+m[1:]
mut_list.append(mut)
mut = mut + ";"
print(mut)
output.write(mut)
output.write("\n")
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns=colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split("\t")
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
#=======================================================================
def main():
pdbname = '3pl1'
mut_filename = "pnca_muts_sample.csv"
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)+1
print(nmuts)
print(mutlist)
print("start")
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname])
print("end")
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname,str(n)])
for n in range(1,nmuts):
print(n)
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname,str(n)])
out = subprocess.check_output(['bash','renamefiles.sh',pdbname])
dGdatafile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Dif_"+pdbname+"_Repair.txt"
dGdata = pd.read_csv(dGdatafile, sep="\t")
print(dGdata)
ddG=[]
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
print(ddG)
distfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_PN.txt"
wt_nc = getInteractions(distfile)
elecfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_PN.txt"
wt_neRR = getInteractions(elecfileRR)
elecfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_MM_"+pdbname+"_Repair_PN.txt"
wt_neMM = getInteractions(elecfileMM)
elecfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SM_"+pdbname+"_Repair_PN.txt"
wt_neSM = getInteractions(elecfileSM)
elecfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SS_"+pdbname+"_Repair_PN.txt"
wt_neSS = getInteractions(elecfileSS)
disufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_PN.txt"
wt_ndRR = getInteractions(disufileRR)
disufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_MM_"+pdbname+"_Repair_PN.txt"
wt_ndMM = getInteractions(disufileMM)
disufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SM_"+pdbname+"_Repair_PN.txt"
wt_ndSM = getInteractions(disufileSM)
disufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SS_"+pdbname+"_Repair_PN.txt"
wt_ndSS = getInteractions(disufileSS)
hbndfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_PN.txt"
wt_nhRR = getInteractions(hbndfileRR)
hbndfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_MM_"+pdbname+"_Repair_PN.txt"
wt_nhMM = getInteractions(hbndfileMM)
hbndfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SM_"+pdbname+"_Repair_PN.txt"
wt_nhSM = getInteractions(hbndfileSM)
hbndfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SS_"+pdbname+"_Repair_PN.txt"
wt_nhSS = getInteractions(hbndfileSS)
partfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_PN.txt"
wt_npRR = getInteractions(partfileRR)
partfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_MM_"+pdbname+"_Repair_PN.txt"
wt_npMM = getInteractions(partfileMM)
partfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SM_"+pdbname+"_Repair_PN.txt"
wt_npSM = getInteractions(partfileSM)
partfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SS_"+pdbname+"_Repair_PN.txt"
wt_npSS = getInteractions(partfileSS)
vdwcfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_PN.txt"
wt_nvRR = getInteractions(vdwcfileRR)
vdwcfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_MM_"+pdbname+"_Repair_PN.txt"
wt_nvMM = getInteractions(vdwcfileMM)
vdwcfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SM_"+pdbname+"_Repair_PN.txt"
wt_nvSM = getInteractions(vdwcfileSM)
vdwcfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SS_"+pdbname+"_Repair_PN.txt"
wt_nvSS = getInteractions(vdwcfileSS)
volufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_PN.txt"
wt_nvoRR = getInteractions(volufileRR)
volufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_MM_"+pdbname+"_Repair_PN.txt"
wt_nvoMM = getInteractions(volufileMM)
volufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SM_"+pdbname+"_Repair_PN.txt"
wt_nvoSM = getInteractions(volufileSM)
volufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SS_"+pdbname+"_Repair_PN.txt"
wt_nvoSS = getInteractions(volufileSS)
dnc = []
dneRR = []
dneMM = []
dneSM = []
dneSS = []
dndRR = []
dndMM = []
dndSM = []
dndSS = []
dnhRR = []
dnhMM = []
dnhSM = []
dnhSS = []
dnpRR = []
dnpMM = []
dnpSM = []
dnpSS = []
dnvRR = []
dnvMM = []
dnvSM = []
dnvSS = []
dnvoRR = []
dnvoMM = []
dnvoSM = []
dnvoSS = []
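# NOTE: only the Distances and *_RR matrices are diffed in the loop below;
# the MM/SM/SS lists declared above stay empty in this test script.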
for n in range(1, nmuts):
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nc = getInteractions(filename)
diffc = wt_nc - mut_nc
dnc.append(diffc)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_neRR = getInteractions(filename)
diffeRR = wt_neRR - mut_neRR
dneRR.append(diffeRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_ndRR = getInteractions(filename)
diffdRR = wt_ndRR - mut_ndRR
dndRR.append(diffdRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nhRR = getInteractions(filename)
diffhRR = wt_nhRR - mut_nhRR
dnhRR.append(diffhRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_npRR = getInteractions(filename)
diffpRR = wt_npRR - mut_npRR
dnpRR.append(diffpRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvRR = getInteractions(filename)
diffvRR = wt_nvRR - mut_nvRR
dnvRR.append(diffvRR)
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
mut_nvoRR = getInteractions(filename)
diffvoRR = wt_nvoRR - mut_nvoRR
dnvoRR.append(diffvoRR)
print(dnc)
print(dneRR)
print(dndRR)
print(dnhRR)
print(dnpRR)
print(dnvRR)
print(dnvoRR)
results = pd.DataFrame([(ddG),(dnc),(dneRR),(dndRR),(dnhRR),(dnpRR),(dnvRR),(dnvoRR)], columns=mutlist, index=["ddG","contacts","electro","disulfide","hbonds","partcov","VdWClashes","volumetric"])
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
print(results)
results2 = results.T # transpose df
outputfilename = "foldx_results_"+pdbname+".csv"
# results.to_csv(outputfilename)
results2.to_csv(outputfilename)
if __name__ == "__main__":
main()

foldx/test2/runFoldx_test2.py Executable file

@ -0,0 +1,456 @@
#!/usr/bin/env python3
import subprocess
import os
import sys
import numpy as np
import pandas as pd
from contextlib import suppress
from pathlib import Path
import re
import csv
import argparse
import shutil
#https://realpython.com/python-pathlib/
# FIXME
#strong dependency of file and path names
#cannot pass file with path. Need to pass them separately
#assumptions made for dir struc as standard
#datadir + drug + input
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
#os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#%%=====================================================================
# Command line options
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
process_dir = args.process_dir
mut_filename = args.mutation_file
chainA = args.chain1
chainB = args.chain2
pdb_filename = args.pdb_file
# os.path.splitext will fail interestingly with file.pdb.txt.zip
#pdb_name = os.path.splitext(pdb_file)[0]
# Just the filename, thanks
#pdb_name = Path(in_filename_pdb).stem
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
#if not process_dir:
# process_dir = datadir + '/' + drug + '/processing'
# Make all paths absolute in case the user forgot
indir = os.path.abspath(indir)
process_dir = os.path.abspath(process_dir)
outdir = os.path.abspath(outdir)
datadir = os.path.abspath(datadir)
#=======
# input
#=======
# FIXME
if pdb_filename:
pdb_name = Path(pdb_filename).stem
else:
pdb_filename = gene.lower() + '_complex.pdb'
pdb_name = Path(pdb_filename).stem
infile_pdb = indir + '/' + pdb_filename
actual_pdb_filename = Path(infile_pdb).name
#actual_pdb_filename = os.path.abspath(infile_pdb)
if mut_filename:
mutation_file = os.path.abspath(mut_filename)
infile_muts = mutation_file
print('User-provided mutation file in use:', infile_muts)
else:
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
infile_muts = outdir + '/' + mutation_file
print('WARNING: Assuming default mutation file:', infile_muts)
#=======
# output
#=======
out_filename = gene.lower() + '_foldx.csv'
outfile_foldx = outdir + '/' + out_filename
print('Arguments being passed:'
, '\nDrug:', args.drug
, '\ngene:', args.gene
, '\ninput dir:', indir
, '\nprocess dir:', process_dir
, '\noutput dir:', outdir
, '\npdb file:', infile_pdb
, '\npdb name:', pdb_name
, '\nactual pdb name:', actual_pdb_filename
, '\nmutation file:', infile_muts
, '\nchain1:', args.chain1
, '\noutput file:', outfile_foldx
, '\n=============================================================')
#=======================================================================
def getInteractionEnergy(filename):
data = pd.read_csv(filename,sep = '\t')
return data['Interaction Energy'].loc[0]
def getInteractions(filename):
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
contactList = getIndexes(data,1)
number = len(contactList)
return number
def formatMuts(mut_file,pdbname):
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
muts = []
for row in readCSV:
mut = row[0]
muts.append(mut)
mut_list = []
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
with open(outfile, 'w') as output:
for m in muts:
print(m)
mut = m[:1] + chainA+ m[1:]
mut_list.append(mut)
mut = mut + ';'
print(mut)
output.write(mut)
output.write('\n')
return mut_list
def getIndexes(data, value):
colnames = data.columns.values
listOfPos = list()
result = data.isin([value])
result.columns = colnames
seriesdata = result.any()
columnNames = list(seriesdata[seriesdata==True].index)
for col in columnNames:
rows = list(result[col][result[col]==True].index)
for row in rows:
listOfPos.append((row,col))
return listOfPos
def loadFiles(df):
# load a text file in to np matrix
resultList = []
f = open(df,'r')
for line in f:
line = line.rstrip('\n')
aVals = line.split('\t')
fVals = list(map(np.float32, aVals))
resultList.append(fVals)
f.close()
return np.asarray(resultList, dtype=np.float32)
# TODO: use this code pattern rather than invoking bash
#def repairPDB():
# subprocess.call(['foldx'
# , '--command=RepairPDB'
# , '--pdb-dir=' + indir
# , '--pdb=' + actual_pdb_filename
# , '--ionStrength=0.05'#
# , '--pH=7'
# , '--water=PREDICT'
# , '--vdwDesign=1'
# , 'outPDB=true'
# , '--output-dir=' + process_dir])
#=======================================================================
def main():
pdbname = pdb_name
comp = '' # for complex only
mut_filename = infile_muts #pnca_mcsm_snps.csv
mutlist = formatMuts(mut_filename, pdbname)
print(mutlist)
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
print('\033[95mSTAGE: repair PDB\033[0m')
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
# once you decide to use the function
# repairPDB(pdbname)
# FIXME: put this hack elsewhere
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
subprocess.call(['foldx'
, '--command=RepairPDB'
, foldx_common
, '--pdb-dir=' + indir
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
print('\033[95mCOMPLETE: repair PDB\033[0m')
print('\033[95mSTAGE: run FoldX (subprocess)\033[0m')
print('EXECUTING: runfoldx.sh %s %s ' % (pdbname, process_dir))
#output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
print('Running foldx BuildModel')
subprocess.call(['foldx'
, '--command=BuildModel'
, foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file=individual_list_' + pdbname + '.txt' # quotes removed: subprocess passes list items verbatim, so embedded quotes would become part of the filename
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx PrintNetworks')
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('Running foldx SequenceDetail')
subprocess.call(['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
print('\033[95mCOMPLETE: run FoldX (subprocess)\033[0m')
print('\033[95mSTAGE: Print Networks (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
#print('\033[96mCommand:\033[0m runPrintNetworks.sh %s %s %s' % (pdbname, str(n), process_dir ))
#with suppress(Exception):
#foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
#subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Print Networks (shell)\033[0m')
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
for n in range(1,nmuts+1):
print('\033[95mMUTATION:\033[0m', n)
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
# FIXME: this is bad design and needs to be done in a pythonic way
with suppress(Exception):
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
print('\033[95mCOMPLETE: Rename Mutation Files (shell)\033[0m')
print('\033[95mSTAGE: Rename Files (shell)\033[0m')
# FIXME: this is bad design and needs to be done in a pythonic way
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
print('\033[95mCOMPLETE: Rename Files (shell)\033[0m')
if comp=='y':
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess)\033[0m')
chain1=chainA
chain2=chainB
#with suppress(Exception):
#subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
shutil.copyfile(ac_source, ac_dest)
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
#with suppress(Exception):
# subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
subprocess.call(['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
shutil.copyfile(ac_mut_source, ac_mut_dest)
print('\033[95mCOMPLETE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
dGdata = pd.read_csv(dGdatafile, sep = '\t')
ddG=[]
print('ddG')
print(len(dGdata))
for i in range(0,len(dGdata)):
ddG.append(dGdata['total energy'].loc[i])
nint = len(interactions)
wt_int = []
for i in interactions:
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
wt_int.append(getInteractions(filename))
print('wt')
print(wt_int)
ntotal = nint+1
print(ntotal)
print(nmuts)
data = np.empty((ntotal,nmuts))
data[0] = ddG
print(data)
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
print(interactions)
IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
#results.append(ddG) # no-op: DataFrame.append returns a new frame, which is discarded here
#print(results.head())
# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm-style muts without the chain ID, e.g. 'SA2C;' -> 'S 2C'
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
# lower case columns
results2.columns = results2.columns.str.lower()
print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))
outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
if __name__ == '__main__':
main()


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
S2C,0.30861700000000003,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
S2F,-0.6481899999999999,-8.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0


@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
L4S,5.7629,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,4.0
L159R,1.66524,-56.0,-26.0,0.0,-2.0,-24.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,-4.0,0.0,-4.0,0.0


@ -0,0 +1,34 @@
./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p /home/tanu/git/LSHTM_analysis/foldx/test2/test2_process -pdb 3pl1.pdb -m pnca_muts_sample.csv -c1 A
============
# Example 1: pnca
# Delete processing output, copy rotabase.txt and individual_list_3pl1.txt in place, run a test
# get files from test/
============
#
clear; rm -rf test2_process/*; cp individual_list_3pl1.txt test2_process/ ; cp rotabase.txt test2_process/; ./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p ./test2_process -pdb 3pl1.pdb -m /tmp/pnca_test_muts.csv -c1 A
============
# Example 2: gidb
============
clear
rm Unrecognized_molecules.txt
rm -rf test2_process/*
cp rotabase.txt test2_process/
./runFoldx.py \
-g gid \
--datadir /home/tanu/git/LSHTM_analysis/foldx/test2 \
-i /home/tanu/git/LSHTM_analysis/foldx/test2 \
-o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output \
-p ./test2_process \
-pdb gid_test2.pdb \
-m gid_test_snps.csv \
-c1 A
#==========
# clear dir
#==========
rm Unrecognized_molecules.txt
find ~/git/LSHTM_analysis/foldx/test2/test2_process -type f -delete


@ -0,0 +1,361 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
from mcsm import *
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = 'isoniazid'
gene = 'KatG'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_complex_mcsm_results.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%%=====================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts
which is the result of build_result_dict() being called for each
mutation and then converting to a pandas df and output as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n===================================================================')
#%%===========================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
mcsm_data['ligand_distance']
#%%===========================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
#############
# ensuring correct dtype columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
print(mcsm_data.dtypes)
#%%===========================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: dtype check for char cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#=============================================================================
# Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#=============================================================================
#sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%===========================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 6 # beware of hardcoded numbers: 7 cols are added and PredAffLog is dropped
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
return mcsm_data_fs
#=======================================================================
# call function
mcsm_df_formatted = format_mcsm_output(infile)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile, index = False)
print('Finished writing file:'
, '\nFile:', outfile
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%
#End of script
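
The rescaling used above for duet_scaled and affinity_scaled keeps the sign and maps values into [-1, 1]: negatives are divided by |min|, non-negatives by max. A minimal sketch on toy values (the numbers are invented for illustration):

import pandas as pd
toy = pd.Series([-2.5, -0.5, 0.0, 1.0, 4.0])
toy_min, toy_max = toy.min(), toy.max()
toy_scale = lambda x : x/abs(toy_min) if x < 0 else x/toy_max
print(toy.apply(toy_scale).tolist()) # [-1.0, -0.2, 0.0, 0.25, 1.0]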


@@ -0,0 +1,310 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
#import requests
import re
#import time
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# dirs
#==========
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input:
#=======
# 1) result_urls (from outdir)
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
print('Input file:', infile_mcsm_output
, '\n=============================================================')
#=======
# output
#=======
out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
outfile_mcsm_norm = outdir + '/' + out_filename_mcsm_norm
print('Output file:', out_filename_mcsm_norm
, '\n=============================================================')
#=======================================================================
print('Reading input file')
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
# PredAffLog = affinity_change_log
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape
print('dim of infile:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
#print('removing white space within column: \mutationinformation')
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
# , '\n===================================================================')
#%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
#mcsm_data.columns = mcsm_data.columns.str.strip()
#new_dtypes = mcsm_data.dtypes
#%%===========================================================================
# very important
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
# create duet_outcome column: classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
mcsm_data['duet_outcome'].value_counts()
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
print('PASS: DUET outcome assigned correctly')
else:
print('FAIL: DUET outcome assigned incorrectly'
, '\nExpected no. of stabilising mutations:', DUET_pos
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
, '\n===============================================================')
#%%===========================================================================
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
mcsm_data['ligand_distance']
#%%===========================================================================
# create ligand_outcome column: classification based on affinity change values
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# number: '-?\d+\.?\d*'
# category: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n===================================================================')
# Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===============================================================')
else:
print('FAIL: spelling change unsuccessful'
, '\nExpected:\n', american_spl
, '\nGot:\n', british_spl
, '\n===============================================================')
#%%===========================================================================
# check dtype in cols: ensure correct dtypes for cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n===================================================================')
#1) numeric cols
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n===================================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===============================================================')
else:
print('FAIL: dtype change to numeric for selected cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%===========================================================================
# Normalise values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#%%===========================================================================
# Normalise values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['ligand_affinity_change']
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
mcsm_data['affinity_scaled']
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#=============================================================================
# Adding colname: wild_pos: sometimes useful for plotting and db
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n===================================================================')
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===============================================================')
else:
print('FAIL: dtype check for char cols unsuccessful'
, '\n===============================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%
#=============================================================================
#%% Removing PredAff log column as it is not needed?
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
print(mcsm_data_f.head())
#=============================================================================
#%% sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
#%%===========================================================================
expected_ncols_toadd = 6 # beware of hardcoded numbers
dforig_len = dforig_shape[1]
expected_cols = dforig_len + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\ncolnames:', mcsm_data_fs.columns
, '\n----------------------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n----------------------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===============================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nLen of orig df:', dforig_len
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting:'
, '\ncheck hardcoded value:', expected_ncols_toadd
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
#%%============================================================================
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
# writing file
print('Writing formatted df to csv')
mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_norm
, '\nExpected no. of rows:', len(mcsm_data_fs)
, '\nExpected no. of cols:', len(mcsm_data_fs.columns)
, '\n=============================================================')
#%%
#End of script
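
The PredAffLog handling above (and in the other formatting scripts) rests on two regexes: one for the numeric part, one for the word ending in 'ing'. A minimal sketch on a made-up string (real mCSM output wording may differ):

import pandas as pd
s = pd.Series(['-1.23 log(affinity fold change) - Destabilizing'])
print(s.str.extract(r'(-?\d+\.?\d*)', expand = True)) # numeric part: -1.23
print(s.str.extract(r'(\b\w+ing\b)', expand = True)) # categorical part: Destabilizing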

mcsm/ind_scripts/mcsm_results.py (Executable file, 149 lines added)

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#import beautifulsoup4
from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input:
#=======
# 1) result_urls (from outdir)
outdir = datadir + '/' + drug + '/' + 'output'
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
infile_url = outdir + '/' + in_filename_url
print('Input filename:', in_filename_url
, '\nInput path(from output dir):', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_output.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#=======================================================================
def scrape_results(out_result_url):
"""
Extract results data using the result url
@params out_result_url: result url for a single mutation
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(out_result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('SUCCESS: Fetching results')
else:
print('FAIL: Could not fetch results'
, '\nCheck if url is valid')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
return web_result_raw
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
Format the web results into a consistent layout so the result dict
can be built (the raw text is a preformatted string: problematic!)
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
# affinity change and DUET stability change cols are split over
# multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
# initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
# print(fields)
if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
return result_dict
#=====================================================================
#%% call function
#request_results(infile_url)
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
result_dict = build_result_dict(results_interim)
output_df = pd.DataFrame()
url_counter = 1 # counter starts at 1 for progress reporting
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
print('Total URLs:',infile_len)
with open(infile_url, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# response = request_results(url_line)
#response = requests.get(url_line)
results_interim = scrape_results(url_line)
result_dict = build_result_dict(results_interim)
print('Processing URL: %s of %s' % (url_counter, infile_len))
df = pd.DataFrame(result_dict, index=[url_counter])
url_counter += 1
output_df = output_df.append(df)
#print(output_df)
output_df.to_csv(outfile, index = None, header = True)
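
What build_result_dict() produces, sketched on an invented preformatted snippet (the blank lines and split 'Change:' lines mimic the quirks the function corrects; the values are not real predictions):

import os
web_result_raw = ('\nChain: A\n\nLigand ID: PZA\n'
'Predicted Affinity Change:\n-0.5 log(affinity fold change) - Destabilizing\n'
'DUET stability change:\n-1.2 Kcal/mol\n')
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
mytext = mytext.replace('ange:\n', 'ange: ') # rejoin fields split over two lines
result_dict = {}
for line in mytext.split('\n'):
    fields = line.split(':')
    if len(fields) > 1:
        result_dict.update(dict(zip(fields[::2], fields[1::2])))
print(result_dict)
# {'Chain': ' A', 'Ligand ID': ' PZA', 'Predicted Affinity Change': ' -0.5 ...', ...}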

mcsm/ind_scripts/run_mcsm.py (Executable file, 240 lines added)

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
#=======================================================================
#TASK:
#=======================================================================
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
#from csv import reader
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
os.getcwd()
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#drug = 'isoniazid'
#gene = 'KatG'
drug = 'cycloserine'
gene = 'alr'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#==========
# output dir
#==========
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input files:
#=======
# 1) pdb file
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb file:', infile_pdb
, '\n=============================================================')
# 2) mcsm snps
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
infile_snps = outdir + '/' + in_filename_snps
print('Input mutation file:', infile_snps
, '\n=============================================================')
#=======
# output files
#=======
# 1) result urls file
#result_urls_filename = gene.lower() + '_result_urls.txt'
#result_urls = outdir + '/' + result_urls_filename
# 2) invalid mutations file
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
#print('Result url file:', result_urls
# , '\n==================================================================='
# , '\nOutput invalid muations file:', outfile_invalid_muts
# , '\n===================================================================')
#%% global variables
host = "http://biosig.unimelb.edu.au"
prediction_url = f"{host}/mcsm_lig/prediction"
#=======================================================================
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
@param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
# print(response.status_code)
# result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
# response = requests.post(prediction_url, files = files, data = body)
# return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#def write_result_url(holding_page, out_result_url, host):
# """
# Extract and write results url from the holding page returned after
# requesting a calculation.
# @param holding_page: response object containinig html content
# @type object
# @param out_result_url: txt file containing urls for mcsm results
# @type string
# @param host: mcsm server name
# @type string
# @return None, writes a file containing result urls (= total no. of muts)
# """
# if holding_page:
# url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
# url = host + url_match.group()
#===============
# writing file
#===============
# myfile = open(out_result_url, 'a')
# myfile.write(url+'\n')
# myfile.close()
# print(myfile)
# return url
#%%
#=======================================================================
# variables to run mcsm lig predictions
#pdb_file = infile_snps_pdb
my_chain = 'A'
my_ligand_id = 'DCS'
my_affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # counter starts at 1 for progress reporting
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
# holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%
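
A usage sketch for a single submission with request_calculation() as defined above (every value below is a placeholder, not project data); a valid mutation appends its result url to <gene>_result_urls.txt and an invalid one is logged to <gene>_errors.txt in the output dir:

request_calculation(pdb_file = '/tmp/alr_complex.pdb' # placeholder path
                    , mutation = 'A123V' # {WT}<POS>{Mut}
                    , chain = 'A'
                    , ligand_id = 'DCS' # must match the ligand code in the pdb
                    , wt_affinity = 10 # wild-type affinity in nM
                    , prediction_url = prediction_url
                    , output_dir = '/tmp'
                    , gene_name = 'alr')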

mcsm/mcsm.py (Normal file, 494 lines added)

@@ -0,0 +1,494 @@
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#from csv import reader
#from mcsm import * # self-import: this module defines these functions itself
#==============================
#%% global variables for defs
#==============================
#%%
def format_data(data_file):
"""
Read file containing SNPs for mcsm analysis and remove duplicates
@param data_file csv file containing nsSNPs for given drug and gene.
csv file format:
single column with no headers with nsSNP format as below:
A1B
B2C
@type data_file: string
@return unique SNPs
@type list
"""
data = pd.read_csv(data_file, header = None, index_col = False)
data = data.drop_duplicates()
mutation_list = data[0].tolist()
# print(data.head())
return mutation_list
# FIXME: documentation
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host):
"""
Makes a POST request for a ligand affinity prediction.
@param pdb_file: valid path to pdb structure
@type string
@param mutation: single mutation of the format: {WT}<POS>{Mut}
@type string
@param chain: single-letter(caps)
@type chr
@param lig_id: 3-letter code (should match pdb file)
@type string
@param wt_affinity: in nM
@type number
@param prediction_url: mcsm url for prediction
@type string
@return response object
@type object
"""
with open(pdb_file, "rb") as pdb_file:
files = {"wild": pdb_file}
body = {
"mutation": mutation,
"chain": chain,
"lig_id": ligand_id,
"affin_wt": wt_affinity
}
response = requests.post(prediction_url, files = files, data = body)
#print(response.status_code)
#result_status = response.raise_for_status()
if response.history:
# if result_status is not None: # doesn't work!
print('PASS: valid mutation submitted. Fetching result url')
#return response
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
url = host + url_match.group()
#===============
# writing file: result urls
#===============
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
myfile = open(out_url_file, 'a')
myfile.write(url + '\n')
myfile.close()
else:
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
, '\nSkipping to the next mutation in file...')
#===============
# writing file: invalid mutations
#===============
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
failed_muts = open(out_error_file, 'a')
failed_muts.write(mutation + '\n')
failed_muts.close()
#=======================================================================
def scrape_results(result_url):
"""
Extract results data using the result url
@params result_url: txt file containing result url
one per line for each mutation
@type string
returns: mcsm prediction results (raw)
@type chr
"""
result_response = requests.get(result_url)
# if results_response is not None:
# page = results_page.text
if result_response.status_code == 200:
print('Fetching results')
# extract results using the html parser
soup = BeautifulSoup(result_response.text, features = 'html.parser')
# print(soup)
web_result_raw = soup.find(class_ = 'span4').get_text()
#metatags = soup.find_all('meta')
metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'})
#print('meta tags:', metatags)
if metatags:
print('WARNING: Submission not ready for URL:', result_url)
# TODO: Add logging
#if debug:
# debug.warning('submission not ready for URL:', result_url)
else:
return web_result_raw
else:
# sys.exit() takes a single argument, so build one message string
sys.exit('FAIL: Could not fetch results'
+ '\nCheck if url is valid')
def build_result_dict(web_result_raw):
"""
Build dict of mcsm output for a single mutation
Format the web results into a consistent layout so the result dict
can be built (the raw text is a preformatted string: problematic!)
@params web_result_raw: directly from html parser extraction
@type string
@returns result dict
@type {}
"""
# remove blank lines from web_result_raw
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
# affinity change and DUET stability change cols are split over
# multiple lines and Mutation information is empty!
mytext = mytext.replace('ange:\n', 'ange: ')
#print(mytext)
# initialise result_dict
result_dict = {}
for line in mytext.split('\n'):
fields = line.split(':')
#print(fields)
if len(fields) > 1: # since Mutation information is empty
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
result_dict.update(dict_entry)
print(result_dict)
return result_dict
#%%
#=======================================================================
def format_mcsm_output(mcsm_outputcsv):
"""
@param mcsm_outputcsv: file containing mcsm results for all muts
which is the result of build_result_dict() being called for each
mutation and then converting to a pandas df and output as csv.
@type string
@return formatted mcsm output
@type pandas df
"""
#############
# Read file
#############
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase, remove spaces and use '_' to join
print('Assigning meaningful colnames i.e. without spaces or hyphens and reflecting units'
, '\n=======================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
, 'Chain': 'chain' # single letter (caps)
, 'Ligand ID': 'ligand_id' # 3-letter code
, 'Distance to ligand': 'ligand_distance' # angstroms
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%=====================================================================
#################################
# populate mutationinformation
# col which is currently blank
#################################
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n=======================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: mutationinformation')
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
, '\n=======================================================')
#%%=====================================================================
#############
# sanity check: drop duplicate muts
#############
# shouldn't exist as this should be eliminated at the time of running mcsm
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===================================================')
else:
print('WARNING: Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===========================================================')
#%%=====================================================================
#############
# Create col: duet_outcome
#############
# classification based on DUET stability values
print('Assigning col: duet_outcome based on DUET stability values')
print('Sanity check:')
# count positive values in the DUET column
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
DUET_pos = c.get(key = 'duet_stability_change')
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
#if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
# print('PASS: DUET outcome assigned correctly')
#else:
# print('FAIL: DUET outcome assigned incorrectly'
# , '\nExpected no. of stabilising mutations:', DUET_pos
# , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
# , '\n======================================================')
#%%=====================================================================
#############
# Extract numeric
# part of ligand_distance col
#############
# Extract only the numeric part from col: ligand_distance
# number: '-?\d+\.?\d*'
mcsm_data['ligand_distance']
print('extracting numeric part of col: ligand_distance')
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract('(\d+\.?\d*)')
print('Ligand Distance:',mcsm_data['ligand_distance'])
#%%=====================================================================
#############
# Create 2 columns:
# ligand_affinity_change and ligand_outcome
#############
# the numerical and categorical parts need to be extracted from column: PredAffLog
# regex used
# numerical part: '-?\d+\.?\d*'
# categorical part: '\b(\w+ing)\b'
print('Extracting numerical and categorical parts from the col: PredAffLog')
print('to create two columns: ligand_affinity_change and ligand_outcome'
, '\n=======================================================')
# 1) Extracting the predicted affinity change (numerical part)
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract('(-?\d+\.?\d*)', expand = True)
print(mcsm_data['ligand_affinity_change'])
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
#aff_regex = re.compile(r'\b(\w+ing)\b')
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
print(mcsm_data['ligand_outcome'])
print(mcsm_data['ligand_outcome'].value_counts())
#############
# changing spelling: British
#############
# ensuring spellings are consistent
american_spl = mcsm_data['ligand_outcome'].value_counts()
print('Changing to British spellings for col: ligand_outcome')
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
print(mcsm_data['ligand_outcome'].value_counts())
british_spl = mcsm_data['ligand_outcome'].value_counts()
# compare series values since index will differ from spelling change
check = american_spl.values == british_spl.values
if check.all():
print('PASS: spelling change successful'
, '\nNo. of predicted affinity changes:\n', british_spl
, '\n===================================================')
else:
sys.exit('FAIL: spelling change unsuccessful'
+ '\nExpected:\n' + str(american_spl)
+ '\nGot:\n' + str(british_spl)
+ '\n===================================================')
#%%=====================================================================
#############
# ensuring correct dtype for numeric columns
#############
# check dtype in cols
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
, '\n=======================================================')
print('Converting the following cols to numeric:'
, '\nligand_distance'
, '\nduet_stability_change'
, '\nligand_affinity_change'
, '\n=======================================================')
# using apply method to change stability and affinity values to numeric
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
# check dtype in cols
print('checking dtype after conversion')
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
if cols_check.all():
print('PASS: dtypes for selected cols:', numeric_cols
, '\nchanged to numeric'
, '\n===================================================')
else:
sys.exit('FAIL: dtype change to numeric for selected cols unsuccessful'
+ '\n===================================================')
print(mcsm_data.dtypes)
#%%=====================================================================
#############
# scale duet values
#############
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
duet_min = mcsm_data['duet_stability_change'].min()
duet_max = mcsm_data['duet_stability_change'].max()
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
, '\n---------------------------------------------------------------'
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c2 = mcsm_data[mcsm_data['duet_scaled']>=0].count()
DUET_pos2 = c2.get(key = 'duet_scaled')
if DUET_pos == DUET_pos2:
print('\nPASS: DUET values scaled correctly')
else:
print('\nFAIL: DUET values scaled numbers MISmatch'
, '\nExpected number:', DUET_pos
, '\nGot:', DUET_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# scale affinity values
#############
# rescale values in affinity change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
aff_min = mcsm_data['ligand_affinity_change'].min()
aff_max = mcsm_data['ligand_affinity_change'].max()
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
, '\n---------------------------------------------------------------'
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# additional check added
c_lig = mcsm_data[mcsm_data['ligand_affinity_change']>=0].count()
Lig_pos = c_lig.get(key = 'ligand_affinity_change')
c_lig2 = mcsm_data[mcsm_data['affinity_scaled']>=0].count()
Lig_pos2 = c_lig2.get(key = 'affinity_scaled')
if Lig_pos == Lig_pos2:
print('\nPASS: Ligand affinity values scaled correctly')
else:
print('\nFAIL: Ligand affinity values scaled numbers MISmatch'
, '\nExpected number:', Lig_pos
, '\nGot:', Lig_pos2
, '\n======================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%=====================================================================
#############
# adding column: wild_pos
# useful for plots and db
#############
print('Creating column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
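# e.g. wild_type 'L' at position 4 gives wild_pos 'L4' (illustrative)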
print(mcsm_data['wild_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_pos')
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# adding column: wild_chain_pos
# useful for plots and db, and it's explicit
#############
print('Creating column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
print(mcsm_data['wild_chain_pos'].head())
# Remove spaces b/w pasted columns
print('removing white space within created column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n=========================================================')
#%%=====================================================================
#############
# ensuring correct dtype in non-numeric cols
#############
#) char cols
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
if cols_check_char.all():
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
, '\n===================================================')
else:
sys.exit('FAIL: unexpected object (string) columns found outside the expected char cols'
+ '\n===================================================')
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
print(mcsm_data.dtypes)
#%%=====================================================================
# Removing PredAffLog column as it is no longer needed
print('Removing col: PredAffLog since relevant info has been extracted from it')
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
#%%=====================================================================
# sort df by position for convenience
print('Sorting df by position')
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%=====================================================================
#############
# sanity check before writing file
#############
expected_ncols_toadd = 6 # beware hardcoding!
dforig_ncols = dforig_shape[1] # no. of cols in the original df
expected_cols = dforig_ncols + expected_ncols_toadd
if len(mcsm_data_fs.columns) == expected_cols:
print('PASS: formatting successful'
, '\nformatted df has expected no. of cols:', expected_cols
, '\n---------------------------------------------------'
, '\ncolnames:', mcsm_data_fs.columns
, '\n---------------------------------------------------'
, '\ndtypes in cols:', mcsm_data_fs.dtypes
, '\n---------------------------------------------------'
, '\norig data shape:', dforig_shape
, '\nformatted df shape:', mcsm_data_fs.shape
, '\n===================================================')
else:
print('FAIL: something went wrong in formatting df'
, '\nNo. of cols in orig df:', dforig_ncols
, '\nExpected number of cols to add:', expected_ncols_toadd
, '\nExpected no. of cols:', expected_cols, '(', dforig_ncols, '+', expected_ncols_toadd, ')'
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
, '\nCheck formatting: is the hardcoded value', expected_ncols_toadd
, '\nreally the no. of cols expected to be added?'
, '\n===================================================')
sys.exit()
return mcsm_data_fs

mcsm/run_mcsm.py Executable file

@ -0,0 +1,219 @@
#!/usr/bin/env python3
# mCSM Wrapper
import os, sys
import subprocess
import time # used for rate-limiting submissions in submit_mcsm()
import argparse
import pandas as pd
from mcsm import *
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
# stage: submit, output url file
arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates a output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt" ')
# stage: get, intermediate mcsm output file
arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ')
# stage: format, formatted output with scaled values, etc
# FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected!
arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ')
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
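# example invocation (hypothetical values; ligand and affinity are only
# needed at the submit stage; PZA/0.99 are the pyrazinamide values used
# elsewhere in this pipeline):
# ./run_mcsm.py --drug pyrazinamide --gene pncA --stage submit --ligand PZA --affinity 0.99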
#=======================================================================
#%% variables
#host = "http://biosig.unimelb.edu.au"
#prediction_url = f"{host}/mcsm_lig/prediction"
#drug = ''
#gene = ''
#%%=====================================================================
# Command line options
gene = args.gene
drug = args.drug
stage = args.stage
chain = args.chain
ligand = args.ligand
affinity = args.affinity
pdb_filename = args.pdb_file
mutation_filename = args.mutation_file
result_urls = args.url_file
mcsm_output = args.outfile_scraped
outfile_format = args.outfile_formatted
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
DEBUG = args.debug
# Actual Globals :-)
host = args.host
prediction_url = args.url
# submit_mcsm globals
homedir = os.path.expanduser('~')
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
gene_match = gene + '_p.'
#============
# directories
#============
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/' # '/' needed to match the Data/<drug>/input layout
if not outdir:
outdir = datadir + drug + '/output/'
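# e.g. with the defaults above and drug='pyrazinamide', paths resolve to
# ~/git/Data/pyrazinamide/input/ and ~/git/Data/pyrazinamide/output/ (illustrative)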
#=======
# input
#=======
if pdb_filename:
in_filename_pdb = pdb_filename
else:
in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
#infile_snps = outdir + '/' + in_filename_snps
if mutation_filename:
in_filename_snps = mutation_filename
else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + in_filename_snps
#=======
# output
#=======
# mcsm_results globals
if not result_urls:
result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + result_urls_filename
if DEBUG:
print('DEBUG: Result URLs:', result_urls)
if not mcsm_output:
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + mcsm_output_filename
if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output)
# format_results globals
#out_filename_format = gene.lower() + '_mcsm_processed.csv'
if not outfile_format:
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
outfile_format = outdir + out_filename_format
if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format)
#%%=====================================================================
def submit_mcsm():
# Example:
# chain = 'A'
# ligand_id = 'RMP'
# affinity = 10
print('Result urls and error file (if any) will be written in: ', outdir)
# call function to format data to remove duplicate snps before submitting job
mcsm_muts = format_data(infile_snps)
mut_count = 1 # counting starts at 1 for human-readable progress messages
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read().strip() # quicker than using Python :-)
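# note: infile_snps_len is a string, which is fine here since it is only
# used in progress messages, never in arithmetic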
print('Total SNPs for', gene, ':', infile_snps_len)
for mcsm_mut in mcsm_muts:
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
if DEBUG:
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
# function call: to request mcsm prediction
# which writes file containing url for valid submissions and invalid muts to respective files
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
time.sleep(1)
mut_count += 1
# result_url = write_result_url(holding_page, result_urls, host)
print('Request submitted'
, '\nCAUTION: Processing will take at least ten'
, 'minutes, but will be longer for more mutations.')
#%%=====================================================================
def get_results():
output_df = pd.DataFrame()
url_counter = 1 # counting starts at 1 for human-readable progress messages
success_counter = 1
infile_len = os.popen('wc -l < %s' % result_urls).read().strip() # quicker than using Python :-)
print('Total URLs:', infile_len)
with open(result_urls, 'r') as urlfile:
for line in urlfile:
url_line = line.strip()
# call functions
results_interim = scrape_results(url_line)
if results_interim is not None:
print('Processing URL: %s of %s' % (url_counter, infile_len))
result_dict = build_result_dict(results_interim)
df = pd.DataFrame(result_dict, index=[url_counter])
output_df = output_df.append(df)
success_counter += 1
url_counter += 1
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
#print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv')
output_df.to_csv(mcsm_output, index = None, header = True)
#%%=====================================================================
def format_results():
print('Input file:', mcsm_output
, '\n============================================================='
, '\nOutput file:', outfile_format
, '\n=============================================================')
# call function
mcsm_df_formatted = format_mcsm_output(mcsm_output)
# writing file
print('Writing formatted df to csv')
mcsm_df_formatted.to_csv(outfile_format, index = False)
print('Finished writing file:'
, '\nFile:', outfile_format
, '\nExpected no. of rows:', len(mcsm_df_formatted)
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
, '\n=============================================================')
#%%=====================================================================
def main():
if stage == 'submit':
print('mCSM stage: submit mutations for mcsm analysis')
submit_mcsm()
elif stage == 'get':
print('mCSM stage: get results')
get_results()
elif stage == 'format':
print('mCSM stage: format results')
format_results()
else:
print('ERROR: invalid stage')
if __name__ == '__main__':
main()


@ -1,512 +0,0 @@
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
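# each summary has one row per unique Position in the same (sorted) order,
# which is why the cbind below lines the two up row for row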
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure"
inFile = paste0(inDir, "complex1_no_water.pdb")
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile)
#====== end of script
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
inFile
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
source("Header_TT.R")
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
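# match() returns, for each residue number in d$resno, the row index of the
# first matching Position in my_df (or NA), so every atom inherits the
# averaged score of its residue position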
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
outDir = "~/git/Data/pyrazinamide/input/structure"
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P2: Replacing B factor with mean ratioLig scores
#=========================================================
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
# output dir
getwd()
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)


@ -1,299 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
#########################################################
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
rm(inDir, inFile2)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have "equal" and "same" no. of NAs
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c0, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#=====================
# write_output files
#=====================
# output dir
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
write.csv(merged_df3, outFile1)
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
#write.csv(merged_df3_comp, outFile2)
rm(outDir
, outFile1
# , outFile2
)
#============================= end of script


@ -1,348 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang)
#########################################################
#########################################################
# Installing and loading required packages
#########################################################
#source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
########################### !!! only for mcsm_lig
# 4: Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
head(mcsm_data$Mutationinformation)
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# check
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
# NOW count na in each column: if you did it before, then
# OR and Pvalue column would say 0 na since these were not numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# all have 81 NA, with pyrazinamide with 960
# and these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the structure,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# FIXME: add this as a sanity check. I have manually checked!
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#===============end of script
#=====================
# write_output files
#=====================
# Not required as this is a subset of the "combining_two_df.R" script


@ -1,244 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 08:46:36 2019
@author: tanushree
"""
############################################
# load libraries
import os
import pandas as pd
from Bio import SeqIO
############################################
#********************************************************************
# TASK: Read in fasta files and create mutant sequences akin to a MSA,
# to allow generation of logo plots
# Requirements:
# input: Fasta file of protein/target for which mut seqs will be created
# path: "Data/<drug>/input/original/<filename>"
# output: MSA for mutant sequences
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/original"
in_filename_fasta = "/3pl1.fasta.txt"
infile_fasta = homedir + basedir + inpath + in_filename_fasta
print("Input file is:", infile_fasta)
inpath_p = "/processed"
in_filename_meta_data = "/meta_data_with_AFandOR.csv"
infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
print("Input file is:", infile_meta_data)
# output: only path specified, filenames in respective sections
outpath = "/processed"
################## end of variable assignment for input and output files
#==========
#read files
#==========
#############
#fasta file
#############
#my_file = infile_fasta
my_fasta = str()
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
my_seq = seq_record.seq
my_fasta = str(my_seq) #convert to a string
print(my_fasta)
# print( len(my_fasta) )
# print( type(my_fasta) )
len(my_fasta)
#############
# SNP info
#############
# read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
my_data = pd.read_csv(infile_meta_data) #3093, 22
list(my_data.columns)
#FIXME: You need a better way to identify this
# remove positions not in the structure
#pos_remove = 186
my_data = my_data[my_data.position != 186] #3092, 22
# if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###############
# data cleaning
################
# extract only those positions that have a frequency count of pos>1
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
# add a column of frequency for each position
#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
# sort by position
mut_info2 = mut_info1.sort_values(by=['position'])
#FIXME
#__main__:1: SettingWithCopyWarning:
#A value is trying to be set on a copy of a slice from a DataFrame.
#Try using .loc[row_indexer,col_indexer] = value instead
#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#sort dataframe by freq values so the row indices are in order!
#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
# , axis = 0
# , ascending = False
# , inplace = False
# , na_position = 'last')
#mut_info2 = mut_info2.reset_index( drop = True)
# count how many pos have freq 1 as you will need to exclude those
len(mut_info2[mut_info2.freq_pos == 1]) #20
# extract entries with freq_pos>1
# should be 3093-211 = 3072
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
# reset index to allow iteration <<<<<<<< IMPORTANT
mut_info = mut_info3.reset_index(drop = True)
del(mut_info1, mut_info2, mut_info3, my_data)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###################
# generate mut seqs
###################
mut_seqsL = [] # note: '[] * n' is just [], so a plain empty list suffices
# iterate
for i, pos in enumerate(mut_info['position']):
print('index:', i, 'position:', pos)
mut = mut_info['mutant_type'][i]
# print(mut)
# print( type(mut) )
print('index:', i, 'position:', pos, 'mutant', mut)
my_fastaL = list(my_fasta)
offset_pos = pos-1 #due to counting starting from 0
my_fastaL[offset_pos] = mut
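# illustrative example: pos = 4, mut = 'S' sets my_fastaL[3] = 'S',
# i.e. the mutation L4S applied to the wild-type sequence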
# print(my_fastaL)
mut_seq = "".join(my_fastaL)
# print(mut_seq + '\n')
mut_seqsL.append(mut_seq)
# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq)
###############
# sanity check
################
len_orig = len(my_fasta)
# checking if all the mutant sequences have the same length as the original fasta file sequence
for seqs in mut_seqsL:
# print(seqs)
# print(len(seqs))
if len(seqs) != len_orig:
print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
else:
print('**Hooray** Length of mutant and original sequences match')
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
print(outpath)
out_filename_gene = "/gene_msa.txt"
outfile_gene = homedir + basedir + outpath + out_filename_gene
print("Output file is:", outfile_gene)
with open(outfile_gene, 'w') as file_handler:
for item in mut_seqsL:
file_handler.write("{}\n".format(item))
R="\n".join(mut_seqsL)
f = open('Columns.csv','w')
f.write(R)
f.close()
#################################################################################
# extracting only positions with SNPs so that when you plot only those positions
################################################################################
#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs
# create a list of unique positions
pos = mut_info['position'] #3072
posL = list(set(list(pos))) #110
del(pos)
snp_seqsL = [] # plain empty list; elements are appended below
for j, mut_seq in enumerate(mut_seqsL):
print (j, mut_seq)
# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
mut_seqsE = list(mut_seq)
# extract specific positions (corresponding to SNPs) from the list of mutant sequences
snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110
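# posL holds 1-based residue positions, hence the i-1 when indexing the
# 0-based python list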
# print(snp_seqL1)
# print(len(snp_seqL1))
snp_seq_clean = "".join(snp_seqL1)
snp_seqsL.append(snp_seq_clean)
###############
# sanity check
################
no_unique_snps = len(posL)
# checking if all the SNP sequences have length equal to the no. of unique SNP positions
for seqs in snp_seqsL:
# print(seqs)
# print(len(seqs))
if len(seqs) != no_unique_snps:
print('sequence lengths mismatch' +'\n', 'snp seq length:', len(seqs), 'vs expected no. of SNP positions:', no_unique_snps)
else:
print('**Hooray** Length of mutant and original sequences match')
del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
print(outpath)
out_filename_snps = "/snps_msa.txt"
outfile_snps = homedir + basedir + outpath + out_filename_snps
print("Output file is:", outfile_snps)
with open(outfile_snps, 'w') as file_handler:
for item in snp_seqsL:
file_handler.write("{}\n".format(item))
R="\n".join(snp_seqsL)
f = open('Columns.csv','w')
f.write(R)
f.close()


@ -1,9 +0,0 @@
#!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh
#./step1_lig_output_urls.sh
./step2_lig_results.sh
./step3a_results_format_interim.sh


@ -1,25 +0,0 @@
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Take a text file containing a list of SNPs, one SNP per line in
# the format C2E, and sort it with unique, which automatically removes
# duplicates. Save the result in the processed data directory.
#**********************************************************************
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to the processed data directory
sort -u ${infile} > ${outfile}
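# e.g. if 'L4S' appears on several lines of the input, only one 'L4S'
# survives in the output (illustrative)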
# count no. of unique snps mCSM will run on
count=$(wc -l < ${outfile})
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"


@ -1,104 +0,0 @@
#!/bin/bash
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# Mutation file must have one mutation (format A1B) per line
# Requirements
# input: mutation list (format: A1B), complex struc: (pdb format)
# mutation: outFile from step0, one unique mutation/line, no chain ID
# path: "Data/<drug>/input/processed/<filename>"
# structure: pdb file of drug-target complex
# path: "Data/<drug>/input/structure/<filename>"
# output: should be n urls (n=no. of unique mutations in file)
# path: "Data/<drug>/input/processed/<filename>"
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath_mut="/processed"
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
echo Input Mut filename: ${infile_mut}
inpath_struc="/structure"
in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# output
outpath="/processed"
out_filename="/complex1_result_url.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# iterate over mutation file (infile_mut); line by line and
# submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb'
pdb="${infile_struc}"
mutation="${mutation}"
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
#=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
refresh_url=$(curl -L \
-sS \
-F "wild=@${pdb}" \
-F "mutation=${mutation}" \
-F "chain=${chain}" \
-F "lig_id=${lig_id}" \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo Refresh URL: $refresh_url
#echo Host+Refresh: ${host}${refresh_url}
# use regex to extract the relevant bit from the refresh url
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
# Now build: result url using host and refresh url and write the urls to a file
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
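# the sed capture keeps everything from '/mcsm' up to the closing quote of
# the http-equiv refresh tag, i.e. the relative path of the results page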
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# considered naming the output file after the number of muts in the input,
# but after much thought rejected it as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.'
done < "${infile_mut}"
#FIXME: stop executing if error else these echo statements are misleading!
echo
echo Output filename: ${outfile}
echo
echo Number of urls saved: $(wc -l < ${outfile})
echo
echo "Processing Complete"
# end of submitting query, receiving result url and storing results url in a file


@@ -1,76 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input URL filename: ${infile}
# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
#COUNT=$(($COUNT+1))
((COUNT++))
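# What each stage does (descriptive note): hxnormalize -x normalises the fetched
# HTML into well-formed XML; hxselect -c prints only the contents of the matched
# elements (div.span4, then div.well); the first sed strips leftover tags and the
# second deletes runs of spaces. hxnormalize/hxselect come from the W3C
# html-xml-utils suite (the "hxtools" referred to above).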
curl --silent ${line} \
| hxnormalize -x \
| hxselect -c div.span4 \
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${infile}"
echo
echo "Processing Complete"


@@ -1,74 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening. Additionally, there are other empty lines
# that need to be omitted. This script ensures these sections are no
# longer split over multiple lines.
# Requirements:
# input: output of step2, file containing mcsm results as described above
# path: "Data/<drug>/input/processed/<filename>"
# output: replaces file in place.
# Therefore first create a copy of the input file
# but rename it to remove the word "MASTER" and add the word "processed"
# file format: .txt
# NOTE: This replaces the file in place!
# the output is a txt file with no stray newlines, formatted so each
# record reads "<colname>:<value>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile
echo Input filename is ${oldfile}
echo
echo Output i.e. copied filename is ${newfile}
# output: No output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything into a convenient format for further processing in python.
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' ${newfile}
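# Illustrative sketch of the in-place edit (values hypothetical): the line
# matching "PredictedAffinityChange" is joined with its next four lines into
# e.g. "PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing";
# the DUET block likewise joins its next two lines, and /^$/d drops blank lines.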


@@ -1,63 +0,0 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
# output
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
################## end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
with open(infile) as f:
    lines = [line.rstrip('\n') for line in f]
outputs = defaultdict(list)
for item in lines:
    col, val = item.split(':', 1)  # split on the first ':' only, in case a value contains ':'
    outputs[col].append(val)
dfOut = pd.DataFrame(outputs)
dfOut.to_csv(outfile, columns=outCols)
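# Illustrative run (hypothetical values): the lines "Wild-type:L" and
# "Position:4" accumulate as outputs['Wild-type'] == ['L'] and
# outputs['Position'] == ['4'], so the nine fields of each mutation record
# line up as one row of the output csv.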


@@ -1,230 +0,0 @@
getwd()
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
getwd()
#=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next script refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv(infile
, header = T
, stringsAsFactors = FALSE)
dim(data)
str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
# very important
table(duplicated(data$Mutationinformation))
# extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
# No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) )
#***************************************************************
# select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),]
#***************************************************************
# sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
# should all be 1
sum(table(df$Mutationinformation) == 1)
# sort df by Position
# MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
# sanity checks
head(df$PredictedAffinityChange)
# should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
# change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
# make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
# sanity checks
head(df$Lig_outcome)
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
# should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
# gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
# sanity checks
head(df$Distancetoligand)
# should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
# change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
# sanity checks
head(df$DUETstabilitychange)
# should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
# change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
# create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome)
table(df$DUET_outcome)
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
# create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position)
# clear variables
rm(n, wt, mut)
################ end of data cleaning


@@ -1,275 +0,0 @@
##################
# load libraries
library(compare)
##################
getwd()
#=======================================================
# TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
# compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: normalised csv file (the next scripts also refer to this
# for yet more processing)
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source(infile)
#This will output two dataframes:
# data: unclean data: 10 cols
# df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
# ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
# sanity checks
head(df[,n]) # all negatives
tail(df[,n]) # all positives
# sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#===========================
# Same as above: in 2 steps
#===========================
# find range of your data
my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)# 14 cols
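# Worked example (hypothetical scores): for values c(-2, -0.5, 1, 4),
# my_min = -2 and my_max = 4, so the rescaled values are
# c(-2/2, -0.5/2, 1/4, 4/4) = c(-1, -0.25, 0.25, 1):
# sign preserved, negatives mapped into [-1, 0) and positives into (0, 1].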
# sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
# should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "Ligand_stability"
# my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioPredAff
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioPredAff )
, main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
# ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
# sanity checks
head(df[,n]) # negatives
tail(df[,n]) # positives
# sanity checks
mean(df[,n])
tapply(df[,n], df[,group], mean)
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
# find range of your data
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) # 15 cols
# sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
# sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
# should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioDUET
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioDUET )
, main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#===================
# write output as csv file
#===================
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)


@@ -1,131 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
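# Note: an order-independent alternative (a sketch using dplyr, loaded above)
# that avoids the duplicated Position column altogether:
# combined_df = inner_join(mean_DUET_by_position, mean_Lig_by_position, by = "Position")
# (the averaged.* columns would then still need renaming as above)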
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)


@@ -1,250 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(cowplot)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#my_df = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df$OR)
#[1] TRUE
#<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR PS Plots
#<<<<<<<<<<<<<<<<<<<
PS_df = my_df
rm(my_df)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
getwd()
source("combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df2 = merged_df3_comp
#my_df2 = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df2)
str(my_df2)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df2$OR)
#[1] TRUE
# sanity check: should be <10
if (max(my_df2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR Lig Plots
#<<<<<<<<<<<<<<<<
Lig_df = my_df2
rm(my_df2)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
g = ggplot(PS_df, aes(x = factor(Position)
, y = ratioDUET))
p1 = g +
geom_point(aes(col = DUET_outcome
, size = OR)) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
table(Lig_df$Lig_outcome)
g = ggplot(Lig_df, aes(x = factor(Position)
, y = ratioPredAff))
p2 = g +
geom_point(aes(col = Lig_outcome
, size = OR))+
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
#combine using cowplot
#======================
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("A", "B")
, label_size = my_als+5))
print(printFile)
dev.off()


@@ -1,154 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,149 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,202 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = Lig_outcome
# subgroup = normalised score i.e ratioPredAff
# Prepare data: round off ratioLig scores
# round off to 3 significant digits:
# 165 if no rounding is performed: used to generate the original graph
# 156 if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioPredAff)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioLigR = round(df$ratioPredAff, n)
u = unique(df$ratioLigR) # 156
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
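# e.g. a hypothetical row with Lig_outcome "Destabilizing" and ratioLigR -0.123
# gets group "Destabilizing_-0.123", so each unique score within each outcome
# receives its own colour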
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioLig
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,192 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = DUET_outcome
# subgroup = normalised score i.e ratioDUET
# Prepare data: round off ratioDUET scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML problems
# check unique values in normalised data
u = unique(df$ratioDUET)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label


@@ -1,215 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = Lig_outcome))
prinfFile = g + geom_bar(
aes(fill = Lig_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
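# ..count.. is the count computed by stat = "count"
# (newer ggplot2 spells this after_stat(count))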
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='Ligand Outcome'
) + scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
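# .N is the per-group row count and := adds pos_count by reference
# (data.table semantics), so each SNP row carries its position's SNP count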
head(df$pos_count)
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#5 24 36 56 30 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#5 12 12 14 6 3
# this is what will get plotted
svg('position_count_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################


@@ -1,211 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilising muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = DUET_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = DUET_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) + scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#34 76 63 104 40 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
foo = select(df, Mutationinformation
, WildPos
, wild_type
, mutant_type
, mutation_info
, position
, pos_count) #335, 7
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
svg('position_count_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################


@@ -1,175 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for PS Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c("ratioDUET"
# , "ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
# , "Ligand Affinity"
# , "DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('DUET_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,187 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c(#"ratioDUET",
"ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c(#"DUET",
"Ligand Affinity"
# ,"DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('Lig_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
, pch = 21
, jitter = T
# , alpha = .05
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,227 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for plots
# you need merged_df2, the comprehensive one
# since this has a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
df_lin = subset(df, subset = lineage %in% sel_lineages )
#FIXME; add sanity check for numbers.
# Done this manually
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each lineage
##########
# Create df with lineage inform & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially a barplot with two y-axes
bar = as.data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
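# for each of the four lineages: count the unique sample ids (total_samples)
# and the unique mutations (total_snps_u) it contributes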
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id[df$lineage == i]))
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$Mutationinformation)))
curr_count = length(unique(foo$Mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
# plotting vectors: the two y-axis series and the x categories
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
melted = melt(to_plot, id = "x")
melted
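# melted has one row per (lineage, variable) pair, with columns: x, variable, value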
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_basic_barplot.svg')
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted
, aes(x = x
, y = value
, fill = variable)
)
printFile = g + geom_bar(
#g + geom_bar(
stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75'
) + theme(
axis.text.x = element_text(
size = my_ats
# , angle= 30
)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(
size = my_als
, colour = 'black'
)
, axis.title.y = element_text(
size = my_als
, colour = 'black'
)
, legend.position = "top"
, legend.text = element_text(size = my_als)
#) + geom_text(
) + geom_label(
aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)
#, position = ('
) + labs(
title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black'
) + scale_fill_manual(
values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')
) + scale_x_discrete(
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
)
print(printFile)
dev.off()

View file

@ -1,233 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 is used: index 1 is the blank/missing lineage entry
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
g <- ggplot(df, aes(x = ratioPredAff)) +
geom_density(aes(fill = Lig_outcome)
, alpha = 0.5) +
facet_wrap( ~ lineage
, scales = "free"
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian(xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_LIG.svg')
printFile = ggplot( df, aes(x = ratioPredAff
, y = Lig_outcome) ) +
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "Ligand Affinity" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)
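# a minimal sketch (not in the original script), assuming the lin1..lin4
# vectors built above: collect the six pairwise KS p-values and apply a
# Benjamini-Hochberg correction for multiple testing
lins = list(lin1, lin2, lin3, lin4)
pair_idx = combn(4, 2) # 2 x 6 matrix of pair indices
ks_pvals = apply(pair_idx, 2, function(i) ks.test(lins[[i[1]]], lins[[i[2]]])$p.value)
p.adjust(ks_pvals, method = "BH")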

View file

@ -1,212 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 is used: index 1 is the blank/missing lineage entry
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
g <- ggplot(df, aes(x = ratioDUET)) +
geom_density(aes(fill = DUET_outcome)
, alpha = 0.5) + facet_wrap(~ lineage,
scales = "free") +
ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_PS.svg')
printFile = ggplot( df, aes(x = ratioDUET
, y = DUET_outcome) )+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size=my_als)
, legend.text = element_text(size=10)
, legend.title = element_text(size=my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioDUET
lin2 = df[df$lineage == "lineage2",]$ratioDUET
lin3 = df[df$lineage == "lineage3",]$ratioDUET
lin4 = df[df$lineage == "lineage4",]$ratioDUET
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)

View file

@ -1,27 +0,0 @@
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# This script only reads the pdb file of your complex
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
#====== end of script

View file

@ -1,386 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P1: Replacing B factor with mean ratioDUET scores
#=========================================================
#########################
# Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original DUET Scores
# 3: replaced B-factors with DUET Scores
#*********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# 3: After the following replacement
#********************************
#=========
# step 0_P1: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
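# match() returns, for each atom's resno, the matching row in my_df by Position;
# atoms at positions with no SNP therefore get NA (handled in step 2_P1)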
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with DUET Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: This replaced B-factor distribution has the same
# x-axis as the PredAff normalised values, but the distribution
# is affected since 0 is overinflated. This is because all the positions
# where there are no SNPs have been assigned 0.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#######################################################################
#====================== end of section 1 ==============================
#######################################################################
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#rm(inDir, inFile)
#########################
# 3: Read complex pdb file
# from the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original Pred Aff Scores
# 3: replaced B-factors with PredAff Scores
#********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=================================================
# Processing P2: Replacing B values with ratioPredAff scores
#=================================================
# use match to perform this replacement linking with "position no"
# in the pdb file, this corresponds to column "resno"
# in my_df, this corresponds to column "Position"
#=========
# step 0_P2: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with PredAff Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "Lig_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
###########
# end of output files with Bfactors
##########

View file

@ -1,257 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
#########################################################
# 1: Installing and loading required packages #
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")
##########################################################
# Checking: Entire data frame and for PS #
##########################################################
###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")
# df with NA:
# merged_df2
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
my_colnames = colnames(mcsm_data)
#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data_s, my_df2)
# from library dplyr
setdiff(mcsm_data_s, my_df2)
#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem
# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe) #
##########################################################
# clear workspace
rm(list = ls())
###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# df with NA:
# merged_df2 :
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos = unique(mcsm_data$Position)
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) #to avoid confusion
table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
upos_f = unique(mcsm_data2$Position); upos_f
# colnames of df that you will need to subset the bigger df from
my_colnames = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]
rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data2_s, my_df2)
# from library dplyr
setdiff(mcsm_data2_s, my_df2)
# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
##########################################################
# extract and write output file for SNP posn: all #
##########################################################
head(merged_df3$Position)
foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# sanity check:
table(snp_pos_unique == combined_df$Position)
#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))
write.table(snp_pos_unique
, outFile1
, row.names = F
, col.names = F)
##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)
foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))
write.table(snp_pos_unique
, outFile2
, row.names = F
, col.names = F)
#============================== end of script

56
mcsm_na/examples.py Executable file
View file

@ -0,0 +1,56 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
from get_results_mcsm_na import *
#%%#####################################################################
#EXAMPLE RUN for different stages
#=====================
# STAGE: submit_mcsm_na.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
my_nuc_type = 'RNA'
my_pdb_file = homedir + '/git/Data/streptomycin/input/gid_complex.pdb'
my_mutation_list = homedir + '/git/LSHTM_analysis/mcsm_na/test_snps_b1.csv'
my_suffix = 'TEST'
#----------------------------------------------
# example 1: 2 snps in a file
#----------------------------------------------
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = my_outdir
, outfile_suffix = my_suffix)
#%%###################################################################
#=====================
# STAGE: get_results.py
#=====================
my_host = 'http://biosig.unimelb.edu.au'
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
#----------------------------------------------
# example 1: single url in a single file
#----------------------------------------------
my_url_file_single = homedir + '/git/LSHTM_analysis/mcsm_na/mcsm_na_temp/mcsm_na_result_url_gid_test_b1.txt'
print(my_url_file_single)
my_suffix = 'single'
get_results(url_file = my_url_file_single
, host_url = my_host
, output_dir = my_outdir
, outfile_suffix = my_suffix)

135
mcsm_na/format_results_mcsm_na.py Executable file
View file

@ -0,0 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_mcsm_na_output(mcsm_na_output_tsv):
"""
@param mcsm_na_output_tsv: file containing mcsm_na results for all muts,
i.e. all the mcsm_na batch results combined into one file
using bash scripts. This is run after run_get_results_mcsm_na.py.
Formats the data into a pandas df.
@type string
@return formatted df for mcsm_na output (written out as csv by the caller)
@type pandas df
"""
#############
# Read file
#############
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
# strip white space from both ends in all columns
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_na_data.shape
print('dimensions of input file:', dforig_shape)
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_na_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
, 'CHAIN': 'chain' # single letter (caps)
, 'WILD_RES': 'wild_type' # one letter amino acid code
, 'RES_POS': 'position' # number
, 'MUT_RES': 'mutant_type' # one letter amino acid code
, 'RSA': 'rsa' # relative solvent accessibility (number)
, 'PRED_DDG': 'mcsm_na_affinity'} # predicted affinity change (number, kcal/mol)
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_na_data.columns
#%%============================================================================
#############
# create mutationinformation column
#############
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
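# e.g. wild_type 'A', position 3, mutant_type 'S' -> 'A3S'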
#%%=====================================================================
#############
# Create col: mcsm_na_outcome
#############
# classification based on mcsm_na_affinity values
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
print('Sanity check:')
# count non-negative values in the mcsm_na_affinity column
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
# print('PASS: mcsm_na_outcome assigned correctly')
#else:
# print('FAIL: mcsm_na_outcome assigned incorrectly'
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
# , '\n======================================================')
#%%=====================================================================
#############
# scale mcsm_na values
#############
# Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
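# e.g. with mcsm_na_min = -2.0 and mcsm_na_max = 0.5:
# an affinity of -1.0 scales to -0.5, and 0.25 scales to 0.5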
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
if mcsm_na_pos == mcsm_na_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity values scaled numbers MISmatch'
, '\nExpected number:', mcsm_na_pos
, '\nGot:', mcsm_na_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_na_data.columns
mcsm_na_dataf = mcsm_na_data[['mutationinformation'
, 'mcsm_na_affinity'
, 'mcsm_na_scaled'
, 'mcsm_na_outcome'
, 'rsa'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'chain'
, 'pdb_file']]
return(mcsm_na_dataf)
#%%#####################################################################

52
mcsm_na/get_results_mcsm_na.py Executable file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def get_results(url_file, host_url, output_dir, outfile_suffix):
# initialise empty df
#mcsm_na_results_out_df = pd.DataFrame()
with open(url_file, 'r') as f:
for count, line in enumerate(f):
line = line.strip()
print('URL no.', count+1, '\n', line)
#============================
# Writing results file: csv
#============================
mcsm_na_results_dir = output_dir + '/mcsm_na_results'
if not os.path.exists(mcsm_na_results_dir):
print('\nCreating dir: mcsm_na_results within:', output_dir )
os.makedirs(mcsm_na_results_dir)
# Download the .txt
prediction_number = re.search(r'([0-9]+\.[0-9]+$)', line).group(0)
print('CHECK prediction no:', prediction_number)
txt_url = f"{host_url}/mcsm_na/static/results/" + prediction_number + '.txt'
print('CHECK txt url:', txt_url)
out_filename = mcsm_na_results_dir + '/' + outfile_suffix + '_output_' + prediction_number + '.txt.gz'
response_txt = requests.get(txt_url, stream = True)
if response_txt.status_code == 200:
print('\nDownloading .txt:', txt_url
, '\n\nSaving file as:', out_filename)
with open(out_filename, 'wb') as f:
f.write(response_txt.raw.read())
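# NOTE: non-200 responses are silently skipped here; logging
# response_txt.status_code in an else branch would make failed batches visible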
#%%#####################################################################

View file

@ -0,0 +1 @@
http://biosig.unimelb.edu.au/mcsm_na/results_prediction/1613147445.16

View file

@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from format_results_mcsm_na import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_na = outdir + 'mcsm_na_results/'
outdir_na = outdir + 'mcsm_na_results/'
# Input file
infile_mcsm_na = outdir_na + gene.lower() + '_output_combined_clean.tsv'
# Formatted output file
outfile_mcsm_na_f = outdir_na + gene.lower() + '_complex_mcsm_na_norm.csv'
#===========================================
# CALL: format_results_mcsm_na()
# Data: gid+streptomycin
# Data: rpob+rifampicin, date: 18/11/2021
#===========================================
print('Formatting results for:', infile_mcsm_na)
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)
# writing file
print('Writing formatted df to csv')
mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_na_f
, '\nExpected no. of rows:', len(mcsm_na_df_f)
, '\nExpected no. of cols:', len(mcsm_na_df_f.columns)
, '\n=============================================================')
#%%#####################################################################

View file

@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from get_results_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# TODO: add cmd line args
#gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#==============================================================================
# batch 26: 25.txt, RETRIEVED: 16 Feb:
# batch 27: 26.txt, RETRIEVED: 6 Aug:
my_url_file = outdir + '/mcsm_na_temp/mcsm_na_result_url_gid_b27.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
print('Downloading results for:', my_url_file, '\nsuffix:', my_suffix)
get_results(url_file = my_url_file
, host_url = my_host
, output_dir = outdir
, outfile_suffix = my_suffix)
#%%#####################################################################

49
mcsm_na/run_submit_mcsm_na.py Executable file
View file

@ -0,0 +1,49 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
from submit_mcsm_na import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
print(my_prediction_url)
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_mcsm_na = outdir + 'mcsm_na_results/'
my_nuc_type = 'RNA'
my_pdb_file = indir + gene.lower() + '_complex.pdb'
#=============================================================================
# batch 26: 25.txt # RAN: 16 Feb:
# batch 27: 26.txt # RAN: 6 Aug:
# note: batch numbering is off by one relative to the file names (batch 27 reads snp_batch_26.txt)
my_mutation_list = outdir + '/snp_batches/20/snp_batch_26.txt'
my_suffix = 'gid_b27'
#==============================================================================
#==========================
# CALL: submit_mcsm_na()
# Data: gid+streptomycin
#==========================
submit_mcsm_na(host_url = my_host
, pdb_file = my_pdb_file
, mutation_list = my_mutation_list
, nuc_type = my_nuc_type
, prediction_url = my_prediction_url
, output_dir = outdir_mcsm_na
, outfile_suffix = my_suffix)
#%%#####################################################################

27
mcsm_na/split_csv.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
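# e.g. with OUTDIR=snp_batches and CHUNK=50 this writes
# snp_batches/50/snp_batch_00, snp_batch_01, ...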
# use case
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20 # date: 17/11/2021
# accidentally replaced the original rpob batches file
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh 5uhc_mcsm_formatted_snps_chain.csv snp_batches_5uhc 20 # date: 17/11/2021

19
mcsm_na/split_format_csv.sh Executable file
View file

@ -0,0 +1,19 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_format_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir
INFILE=$1
OUTDIR=$2
CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
for i in *; do mv $i $i.txt; done
sed -i 's/^/A /g' *.txt
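# rename each batch to .txt and prepend the chain id 'A ' to every line,
# giving the '{chain} {WT}<POS>{MUT}' format that mcsm-NA expects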

84
mcsm_na/submit_mcsm_na.py Executable file
View file

@ -0,0 +1,84 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_mcsm_na(host_url
, pdb_file
, mutation_list
, nuc_type
, prediction_url
, output_dir
, outfile_suffix
):
"""
Makes a POST request for mcsm_na predictions.
@param host_url: valid host url for submitting the job
@type string
@param pdb_file: valid path to pdb structure
@type string
@param mutation_list: list of mutations (1 per line) of the format: {chain} {WT}<POS>{MUT}, e.g. "A X1Z"
@type string
@param nuc_type: Nucleic acid type
@type string
@param prediction_url: mcsm_na url for prediction
@type string
@param output_dir: output dir
@type string
@param outfile_suffix: outfile_suffix
@type string
@return writes a .txt file containing url for the snps processed with user provided suffix in filename
@type string
"""
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
files = {"wild": pdb_file
, "mutation_list": mutation_list}
body = {"na_type": nuc_type
,"pred_type": 'list',
"pdb_code": ''} # apparently needs it even though blank!
response = requests.post(prediction_url, files = files, data = body)
print(response.status_code)
if response.history:
print('\nPASS: valid submission. Fetching result url')
url_match = re.search('/mcsm_na/results_prediction/.+(?=")', response.text)
url = host_url + url_match.group()
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
#===============
# writing file: result urls
#===============
mcsm_na_temp_dir = output_dir + '/mcsm_na_temp' # creates a temp dir within output_dir
if not os.path.exists(mcsm_na_temp_dir):
print('\nCreating mcsm_na_temp in output_dir', output_dir )
os.makedirs(mcsm_na_temp_dir)
out_url_file = mcsm_na_temp_dir + '/mcsm_na_result_url_' + str(outfile_suffix) + '.txt'
print('\nWriting output url file:', out_url_file)
myfile = open(out_url_file, 'a')
myfile.write(url)
myfile.close()
#%%#####################################################################
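Note that submit_mcsm_na() only recognises a successful submission via response.history (i.e. the POST was redirected); a failed submission currently falls through without writing anything. A hedged sketch of a stricter guard one could add around the POST (illustrative, not part of the script):

# hypothetical stricter check, for illustration only
response = requests.post(prediction_url, files = files, data = body)
response.raise_for_status()   # raise on HTTP 4xx/5xx
if not response.history:
    raise RuntimeError('mcsm_na submission not redirected: '
                       'check mutation list format and na_type')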

2
mcsm_na/test_snps_b1.csv Normal file
View file

@ -0,0 +1,2 @@
A P3S
A I4N

View file

@ -0,0 +1,158 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
homedir = os.path.expanduser('~')
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict
#%%============================================================================
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
"""
@param mcsm_ppi2_output_csv: csv containing mcsm_ppi2 results for all mcsm snps,
i.e. the single file obtained by combining all the mcsm_ppi2 batch results
with the bash scripts.
@type string
@return formatted pandas df for the mcsm_ppi2 output
(note: the function returns the df; the caller writes the csv)
@type pandas df
"""
#############
# Read file
#############
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
# strip white space from both ends in all columns
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_ppi2_data.shape
print('dimensions of input file:', dforig_shape)
#############
# Map 3 letter
# code to one
#############
# initialise a sub dict that is lookup dict for
# 3-LETTER aa code to 1-LETTER aa code
lookup_dict = dict()
for k, v in up_3letter_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
mut = mcsm_ppi2_data['mutant'].squeeze()
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
# #############
# # CHECK
# # Map 1 letter
# # code to 3Upper
# #############
# # initialise a sub dict that is lookup dict for
# # 3-LETTER aa code to 1-LETTER aa code
# lookup_dict = dict()
# for k, v in oneletter_aa_dict.items():
# lookup_dict[k] = v['three_letter_code_upper']
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
# mut = mcsm_ppi2_data['m_type'].squeeze()
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
# # check
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%============================================================================
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_ppi2_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'chain': 'chain'
, 'wild-type': 'wt_upper'
, 'res-number': 'position'
, 'mutant': 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
, 'affinity': 'mcsm_ppi2_outcome'
, 'w_type': 'wild_type' # one letter amino acid code
, 'm_type': 'mutant_type' # one letter amino acid code
}
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_ppi2_data.columns
#############
# create mutationinformation column
#############
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
#%%=====================================================================
#########################
# scale mcsm_ppi2 values
#########################
# Rescale values in the mcsm_ppi2_affinity col to between -1 and 1, so that
# negative numbers stay negative and positive numbers stay positive
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
, '\n---------------------------------------------------------------'
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
print('\nPASS: Affinity values scaled correctly')
else:
print('\nFAIL: Affinity value counts mismatch after scaling'
, '\nExpected number:', mcsm_ppi2_pos
, '\nGot:', mcsm_ppi2_pos2
, '\n======================================================')
#%%=====================================================================
#############
# reorder columns
#############
mcsm_ppi2_data.columns
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
, 'mcsm_ppi2_affinity'
, 'mcsm_ppi2_scaled'
, 'mcsm_ppi2_outcome'
, 'interface_dist'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'wt_upper'
, 'mut_upper'
, 'chain']]
return(mcsm_ppi2_dataf)
#%%#####################################################################
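The scaling rule divides negative affinities by |min| and non-negative ones by max, so signs are preserved and every scaled value lands in [-1, 1]. A toy worked example (values invented for illustration):

import pandas as pd

s = pd.Series([-2.0, -1.0, 0.0, 2.0, 4.0])    # toy affinities: min = -2.0, max = 4.0
scale = lambda x: x/abs(s.min()) if x < 0 else x/s.max()
print(s.apply(scale).tolist())                # [-1.0, -0.5, 0.0, 0.5, 1.0]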

View file

@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021
@author: tanu
"""
#%% load packages
import sys, os
homedir = os.path.expanduser('~')
#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
from format_results_mcsm_ppi2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
arg_parser.add_argument('--input_file' , help = 'Input file containing the combined mcsm_ppi2 results. By default, it assumes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
infile_mcsm_ppi2 = args.input_file
#outdir_ppi2 = args.mkdir_name
make_dirs = args.make_dirs
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/git/Data/'
if not indir:
indir = datadir + drug + '/input/'
if not outdir:
outdir = datadir + drug + '/output/'
#if not mkdir_name:
# outdir_ppi2 = outdir + 'mcsm_ppi2/'
outdir_ppi2 = outdir + 'mcsm_ppi2/'
# Input file
if not infile_mcsm_ppi2:
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
# Formatted output file
outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
#==========================
# CALL: format_mcsm_ppi2_output()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_mcsm_ppi2)
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
# writing file
print('Writing formatted df to csv')
mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_mcsm_ppi2_f
, '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
, '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################
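The formatter can also be driven without the CLI by importing it directly; a minimal sketch using the default paths above (the drug/gene values are illustrative):

import os
from format_results_mcsm_ppi2 import format_mcsm_ppi2_output

# 'streptomycin' and 'gid' stand in for args.drug / args.gene
infile = os.path.expanduser('~/git/Data/streptomycin/output/mcsm_ppi2/gid_output_combined_clean.csv')
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile)
mcsm_ppi2_df_f.to_csv('gid_complex_mcsm_ppi2_norm.csv', index = False)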

View file

@ -1,512 +0,0 @@
, stringsAsFactors = F)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
my_logor
pnca_snps_or$Mutationinformation == i
View(pnca_snps_or)
#===============
# Step 4: Calculate for one snp
# using i, when you run the loop, it is easy
#===============
i = "pnca_p.trp68gly"
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# uncomment as necessary
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
#===============
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
foo = pnca_snps_or[c(1:5,)]
foo = pnca_snps_or[c(1:5),]
foo = as.data.frame(pnca_snps_or[c(1:5),])
View(foo)
# create an empty dataframe
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is a test before you apply it to all others
#===============
pnca_snps_or$mutation == i
View(pnca_snps_or)
# create an empty dataframe
pnca_snps_or = data.frame(mutation = i)
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or_copy)
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is a test before you apply it to all others
#===============
#reset original df so you don't make a mistake
pnca_snps_or = pnca_snps_or_copy
for (i in pnca_snps_unique){
print(i)
}
pnca_snps_or = pnca_snps_or_copy #2133, 1
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
View(pnca_snps_or)
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or)
pnca_snps_or = pnca_snps_or_copy #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
warnings()
View(pnca_snps_or)
View(pnca_snps_or_copy)
#sanity check
pnca_snps_or$mutation == i1
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2]
pnca_snps_or[pnca_snps_or$mutation == i2,]
pnca_snps_or1 = unique(pnca_snps_or)
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
View(pnca_snps_or)
2.290256e+01
1.561132e+06
3.242285e-04
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2,]
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
View(my_data)
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
pnca_snps_or = snps_all
pnca_snps_or = as.data.frame(snps_all)
View(pnca_snps_or)
snps_all[-"true_wt"]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
View(pnca_snps_or)
snps_all = as.data.frame(snps_all)
View(snps_all)
#remove true_wt entry
w1 = which(rownames(snps_all) == "true_wt")
View(snps_all)
#remove true_wt entry
w1 = which(snps_all$snps_all == "true_wt")
rm(pnca_snps_or)
pnca_snps_or = snps_all[-w1]
pnca_snps_or = snps_all[,-w1]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
#remove true_wt entry
w1 = which(snps_all) == "true_wt"
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
snps_all = as.data.frame(snps_all)
snps_all[-c(1,1)]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or = pnca_snps_or_copy
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or -> pnca_snps_or_copy
#===============
# Step 4: Iterate through this unique list
# and calculate OR for each snp
# and assign to the pnca_snps_or df that has
# each row as a unique snp
#===============
# reset original df so you don't make a mistake: IMPORTANT
pnca_snps_or = pnca_snps_or_copy #2133, 1
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #337, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confidence interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
getwd()
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
getwd()
#===============
# Step 1: read raw data
#===============
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
# combine the two mutation columns
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
head(raw_data$all_mutations_pyrazinamide)
# create yet another column that contains all the mutations but in lower case
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
table(grepl("pnca_p",raw_data$all_muts_pza))
#FALSE TRUE
#10603 1908
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# subset a small section to test
#pnca_snps_or_copy = pnca_snps_or
#pnca_snps_or = pnca_snps_or_copy
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
i2 = "pnca_p.trp68gly" # Should exist
grep(i2, pnca_snps_unique)
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
my_data = unique(my_data$mutation)
my_data[!duplicated(my_data$mutation)]
my_data_unique = my_data[!duplicated(my_data$mutation),]
my_data[!duplicated('mutation'),]
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
my_data_unique = my_data[!duplicated(my_data['mutation']),]
getwd()
setwd("/git/LSHTM_analysis/meta_data_analysis")
getwd()
getwd()
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
c = file.choose()
c = file.choose(../Data_original)
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F))
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
outdir = paste0("../mcsm_analysis",drug,"/Data/")
# define output variables
drug = 'pyrazinamide'
outdir = paste0("../mcsm_analysis",drug,"/Data/")
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
output_filename
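The history above re-runs the same glm() extraction (log OR, SE, z, p, CI) over and over. For reference, the equivalent extraction sketched in Python with statsmodels, on invented toy counts:

import numpy as np
import statsmodels.api as sm

# toy 2x2 counts, invented for illustration: OR = (10/5)/(20/40) = 4
x = np.repeat([1, 1, 0, 0], [10, 5, 20, 40])   # IV: snp present/absent
y = np.repeat([1, 0, 1, 0], [10, 5, 20, 40])   # DV: resistant/susceptible
res = sm.Logit(y, sm.add_constant(x)).fit(disp = 0)
my_logor = res.params[1]               # beta, i.e. log OR
my_se    = res.bse[1]                  # SE
my_zval  = res.tvalues[1]              # z-value
my_pval  = res.pvalues[1]              # p-value
my_or    = np.exp(my_logor)            # OR, ~4 here
my_ci    = np.exp(res.conf_int()[1])   # 95% CI on the OR scale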

View file

@ -1,7 +0,0 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}'
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'

View file

@ -1,241 +0,0 @@
getwd()
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id"
, "pyrazinamide"
, "dr_mutations_pyrazinamide"
, "other_mutations_pyrazinamide")]
#####
# 1a: exclude na
#####
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
total_samples = length(unique(raw_data$id))
print(total_samples)
# sanity check: should be true
is.numeric(total_samples)
#####
# 1b: combine the two mutation columns
#####
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
, raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)
#####
# 1c: create yet another column that contains all the mutations but in lower case
#####
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
# sanity checks
table(grepl("pnca_p",raw_data$all_muts_pnca))
# sanity check: should be TRUE
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i,raw_data$all_muts_pnca))
i = "pnca_p.trp68gly"
table(grepl(i,raw_data$all_muts_pnca))
mut = grepl(i,raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)
#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
pnca_snps_or = read.csv(file.choose()
, stringsAsFactors = F
, header = T)
# extract unique snps to iterate over for AF and OR calcs
# total no of unique snps
# AF and OR calculations
pnca_snps_unique = unique(pnca_snps_or$mutation)
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
or(mut,dst)
})
ors
pvals = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
fisher.test(mut,dst)$p.value
})
pvals
afs = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
mean(mut)
})
afs
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']
#plot(density(log(ors)))
#plot(-log10(pvals))
#hist(log(ors)
# ,breaks = 100
# )
# subset df cols to add to the calc param df
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
head(rownames(pnca_snps_cols))
#snps_with_AF_and_OR
# combine
comb_AF_and_OR = data.frame(ors, pvals, afs)
head(rownames(comb_AF_and_OR))
# sanity checks: should be the same
dim(comb_AF_and_OR); dim(pnca_snps_cols)
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
# merge the above two df whose dim you checked
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
, by = "row.names"
# , all.x = T
)
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
#===============
# Step 3: Read data file where you will add the calculated OR
# Note: this is the big file with one-many relationship between snps and lineages
# i.e fname4 from 'pnca_extraction.py'
#===============
my_data = read.csv(file.choose()
, row.names = 1
, stringsAsFactors = FALSE)
head(my_data)
length(unique(my_data$id))
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
# FILES TO MERGE:
# comb_AF_and_OR: file containing OR
# my_data = big meta data file
# linking column: mutation
head(my_data)
merged_df = merge(my_data # big file
, snps_with_AF_and_OR # small (afor file)
, by = "mutation"
, all.x = T) # because you want all the entries of the meta data
# sanity checks: should be True
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]
# sanity check: very important!
colnames(merged_df)
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
#FIXME: what happened to other 7 and FALSE
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
# problem
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
#throw away the y because that is a smaller df
d1 = which(colnames(merged_df) == "mutation_info.y") #21
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
merged_df2 = merged_df[-c(d1, d2)] #3093 20
colnames(merged_df2)
# rename cols
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
colnames(merged_df2)
# should be 0
sum(is.na(merged_df2$Mutationinformation))
# count na in each column
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
# only some or and Af should be NA
#Row.names ors pvals afs
#81 81 81 81
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
colnames(merged_df2)
# add log OR and neglog pvalue
merged_df2$logor = log(merged_df2$OR)
is.numeric(merged_df2$logor)
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
is.numeric(merged_df2$neglog10pvalue)
# write file out
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
# define output variables
drug = 'pyrazinamide'
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
write.csv(merged_df2, output_filename
, row.names = F)
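The or() function above is a 2x2 cross-ratio with 0.5 substituted into any empty cell (a Haldane-Anscombe style continuity correction) so the ratio never divides by zero. The same idea as a Python sketch over boolean vectors:

import numpy as np

def odds_ratio(mut, dst):
    # cross-ratio with 0.5 for empty cells, mirroring or() above
    mut = np.asarray(mut, dtype = bool)
    dst = np.asarray(dst, dtype = bool)
    a = np.sum(mut & dst)   or 0.5   # mut present, resistant
    b = np.sum(mut & ~dst)  or 0.5   # mut present, susceptible
    c = np.sum(~mut & dst)  or 0.5
    d = np.sum(~mut & ~dst) or 0.5
    return (a / b) / (c / d)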

View file

@ -1,626 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019
@author: tanu
"""
# FIXME: include error checking to ensure you only
# concentrate on positions that have structural info?
#%% load libraries
###################
# load libraries
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype
# to create dir
#my_dir = os.path.expanduser('~/some_dir')
#make sure mcsm_analysis/ exists
#or specify the output directory
#%%
#%%
#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#%%
#NOTE: out_dir MUST exist
# User defined dir structure
drug = 'pyrazinamide'
gene = 'pnca'
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'
if not os.path.exists(data_dir):
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
os.makedirs(data_dir)
sys.exit(1)
if not os.path.exists(out_dir):
print('Error!', out_dir, 'does not exist. Please create it')
exit()
#if not os.path.exists(work_dir):
# print('creating dir that does not exist', 'dir_name:', work_dir)
# os.makedirs(work_dir)
else:
print('Dir exists: Carrying on')
# now create sub dir structure within work_dir
# pyrazinamide/mcsm_analysis
# we need three dir
# Data
# Scripts
# Plotting
# Results
# Plots
# create a list of dir names
#dir_names = ['Data', 'Scripts', 'Results']
#for i in dir_names:
# this_dir = (work_dir + '/' + i)
# if not os.path.exists(this_dir):
# print('creating dir that does not exist:', this_dir)
# os.makedirs(this_dir)
#else:
# print('Dir exists: Carrying on')
# Create sub dirs
# 1)
# Scripts
# Plotting
#subdir_plotting = work_dir + '/Scripts/Plotting'
#if not os.path.exists(subdir_plotting):
# print('creating dir that does not exist:', subdir_plotting)
# os.makedirs(subdir_plotting)
#else:
# print('Dir exists: Carrying on')
# 2)
# Results
# Plots
#subdir_plots = work_dir + '/Results/Plots'
#if not os.path.exists(subdir_plots):
# print('creating dir that does not exist:', subdir_plots)
# os.makedirs(subdir_plots)
#else:
# print('Dir exists: Carrying on')
# clear varaibles
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
#exit()
#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep = ',')
# column names
list(meta_data.columns)
# extract relevant columns from the meta data related to pyrazinamide
meta_data = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
# checks
total_samples = meta_data['id'].nunique() # 19265
# counts NA per column
meta_data.isna().sum()
# glance
meta_data.head()
# equivalent of table in R
# pyrazinamide counts
meta_data.pyrazinamide.value_counts()
#%%
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pnca_p.
# in the dr_mutations and other_mutations columns,
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure)
# and drop the entries with NA
#############
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
del(meta_pza)
##########################
# pyrazinamide: pnca_p.
##########################
meta_data_pnca = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
del(meta_data)
# sanity checks
# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)
# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)
# Now extract "all" mutations
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
meta_pnca_all['id'].nunique()
pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#
# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()
# more sanity checks
# !CAUTION!: muts will change depending on your gene
# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#
# remove not required variables
del(meta_data_pnca)
############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as
# the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############
# sanity check: count NAs per column after subsetting the df, i.e. in meta_pnca_all (with pncA_p. mutations extracted)
meta_pnca_all.isna().sum()
#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations_columns
#=#=#=#=#=#=#=#=#=#
# define the split function
def tidy_split(df, column, sep='|', keep=False):
"""
Split the values of a column and expand so the new DataFrame has one split
value per row. Filters rows where the column is missing.
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as its own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
"""
indexes = list()
new_values = list()
#df = df.dropna(subset=[column]) #<<<<<<----- see this in case you need to uncomment based on use case
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')
# remove leading white space else these are counted as distinct mutations as well
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()
########
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')
# remove the leading white spaces in the column
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()
##########
# Step 4: Reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
# data frame “df” is passed to melt() function
# id_vars is the variable which need to be left unaltered
# var_name are the column names so we named it as 'mutation_info'
# value_name are its values so we named it as 'mutation'
##########
meta_pnca_WF1.columns
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
, id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
, var_name = 'mutation_info'
, value_name = 'mutation')
# sanity check: should be true
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
print('sanity check passed: Long format df has the expected length')
else:
print("Sanity check failed: Debug please!")
###########
# Step 5: This is still dirty data. Filter LF data so that you only have
# mutations corresponding to pnca_p.
# this will be your list you run OR calcs
###########
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')]
# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
print("Sanity check passed: No of samples with pncA mutations match")
else:
print("Sanity check failed: Debug please!")
# count if all the mutations are indeed in the protein coding region
# i.e begin with pnca_p
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
# should be true.
# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
print("Sanity check failed: Debug please!")
###########
# Step 6: Filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have the
# NA in the specified drug column
# it creates problems when performing calcs in R inside the loop
# so best to filter it here
###########
# NOT NEEDED FOR all snps, only for extracting valid OR snps
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()
meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()
###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my integrated mcsm file has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
###########
# Step 9 : Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one from the
# reference_dict imported already. First convert the mutation to lowercase
# to allow matching entries from the dict
###########
#=======
# Step 9a: iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separately for wild type and mutant cols.
# The three letter code is extracted using a regex match from the dataframe and then converted
# to a pandas series, since map only works on a pandas series
#=======
# initialise a sub dict that is a lookup dict for three letter code to one
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separately using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')
# clear variables
del(k, v, wt, mut, lookup_dict)
#=========
# Step 9b: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
#========
# Step 9c: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
########
# Step 10: combine the wild_type+position+mutant_type columns to generate
# Mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script
# consult if necessary
# possibly due to the presence of true_wt
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
#=#=#=#=#=#=#
#%%
###########
# Step 12: Output files for only SNPs to run mCSM
###########
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum() # should be 0
# output csv: all SNPS for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
output_file_path = data_dir + '/input/processed/' + drug + '/'
if not os.path.exists(output_file_path):
print( output_file_path, 'does not exist. Creating')
os.makedirs(output_file_path)
exit()
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
# Bad practice: numbers at the start of a filename
snps_only.to_csv(output_filename, header = False, index = False)
#=========
# Step 12b: all snps with annotation
#=========
# all snps, selected cols
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
# , 'drtype', 'pyrazinamide'
# , 'mutation_info', 'mutation', 'Mutationinformation']]
#len(pnca_snps_ALL)
# sanity check
#meta_pnca_LF1['mutation'].nunique()
# output csv: WITH column but WITHOUT row names(all snps with meta data)
# specify variable name for output file
#gene = 'pnca'
#drug = 'pyrazinamide'
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_file_path = work_dir + '/Data/'
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename) #<<<- check
# write out file
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
#=========
# Step 12c: comp snps for OR calcs with annotation
#=========
# remove all NA's from pyrazinamide column from LF1
# counts NA per column
meta_pnca_LF1.isna().sum()
# remove NA
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
# sanity checks
# should be True
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
# unique counts
meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()
# sanity check
meta_pnca_LF2['id'].nunique()
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
print ('sanity check passed: complete numbers match')
else:
print('Error: Please Debug!')
# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)
# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename) #<<<-check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPS for info (i.e snps for which OR exist)
# specify variable name for output file
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
snps_only.to_csv(output_filename, header = False, index = False)
#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm
# but you have that already in file called pnca_snps...
# LF2: is the file for extracting snps tested for DS and hence OR calcs
#=#=#=#=#=#=#=#
###########
# Step 13 : Output the whole df i.e
# file for meta_data which is now formatted with
# each row as a unique snp rather than the original version where
# each row is a unique id
# ***** This is the file you will ADD the AF and OR calculations to *****
###########
# output csv: the entire DF
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname4 = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + my_fname4 + '.csv'
print(output_filename) #<<<-check
# write out file
meta_pnca_LF1.to_csv(output_filename)
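For reference, tidy_split() above expands one semicolon-separated row into one row per mutation; a toy call (data invented) behaves like this:

import pandas as pd

toy = pd.DataFrame({'id': [1, 2]
                    , 'dr_mutations_pyrazinamide': ['pncA_p.Gln10Pro; pncA_p.Trp68Gly'
                                                    , 'pncA_p.Val139Leu']})
out = tidy_split(toy, 'dr_mutations_pyrazinamide', sep = ';')
# id 1 now spans two rows, one per mutation; the leading space on the
# second value is why step 3a applies .str.lstrip() afterwards
print(out)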

View file

@ -1,121 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019
@author: tanushree
"""
############################################
#load libraries
import pandas as pd
import os
#############################################
#!#########################!
# REQUIREMENTS:
# Data_original/ must exist
# containing GWAS data
#!#########################!
print(os.getcwd())
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/Data/input/original')
print(os.getcwd())
#==========
#read file
#==========
my_aa = pd.read_csv('aa_codes.csv') #20, 6
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #another way to do it, since it is the first column
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
#=========================================================
#convert file to dict of dicts
#=========================================================
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
#with your choice of column name that you have assigned to index as the "primary key".
#using 'index' creates a dict of dicts
#using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
#================================================
#dict of aa with their corresponding properties
#This is defined twice
#================================================
#7 categories: no overlap
qualities1 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('N', 'Q'): 'Amidic'
, ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
, ('S', 'T'): 'Hydroxylic'
, ('F', 'W', 'Y'): 'Aromatic'
, ('C', 'M'): 'Sulphur'
}
#9 categories: allowing for overlap
qualities2 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('S', 'T', 'N', 'Q'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
, ('S', 'G', 'A', 'P'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic'
, ('C', 'G', 'P'): 'Special'
}
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
#, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, Y MISSING FROM POLAR!
, ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
, ('C', 'G', 'P'): 'Special'
}
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
}
qualities_polarity = { ('D', 'E'): 'acidic'
, ('H', 'K', 'R'): 'basic'
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
}
#==============================================================================
#adding amino acid properties to my dict of dicts
for k, v in my_aa_dict.items():
#print (k,v)
v['aa_prop1'] = str() #initialise keys
v['aa_prop2'] = list() #initialise keys (allows for overlapping properties)
v['aa_taylor'] = list() #initialise keys (allows for overlapping properties)
v['aa_prop_water'] = str() #initialise keys
v['aa_prop_polarity'] = str() #initialise keys
for group in qualities1:
if v['one_letter_code'] in group:
v['aa_prop1']+= qualities1[group] # += for str concat
for group in qualities2:
if v['one_letter_code'] in group:
v['aa_prop2'].append(qualities2[group]) # append to list
for group in qualities_taylor:
if v['one_letter_code'] in group:
v['aa_taylor'].append(qualities_taylor[group]) # append to list
for group in qualities_water:
if v['one_letter_code'] in group:
v['aa_prop_water']+= qualities_water[group] # += for str concat
for group in qualities_polarity:
if v['one_letter_code'] in group:
v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat
#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
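# an illustrative lookup, assuming 'ala' is one of the three letter lowercase row names:
# my_aa_dict['ala']['aa_prop1'] -> 'Hydrophobic'
# my_aa_dict['ala']['aa_prop2'] -> ['Small']
# my_aa_dict['ala']['aa_taylor'] -> ['Hydrophobic', 'Small']
# my_aa_dict['ala']['aa_prop_water'] -> 'hydrophobic'
# my_aa_dict['ala']['aa_prop_polarity'] -> 'non-polar'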
#==============================================================================

@ -4,9 +4,6 @@
## Structure:
#
# $DATA_DIR/$DRUG/input
# |- original
# |- processed
# |- structure
#
# $DATA_DIR/$DRUG/output
# |- plots
@ -15,18 +12,17 @@
DATA_DIR=~/git/Data
if [[ $1 == '' ]]; then
echo "Error"
echo "usage: mk-drug-dirs.sh <drug name>";
exit;
else
DRUG=$1
echo Creating structure for: $DRUG
echo Creating directory structure for: $DRUG
if [ -d $DATA_DIR ]
then
echo Doing creation in $DATA_DIR
mkdir -p $DATA_DIR/$DRUG/input/original
mkdir -p $DATA_DIR/$DRUG/input/processed
mkdir -p $DATA_DIR/$DRUG/input/structure
mkdir -p $DATA_DIR/$DRUG/input
mkdir -p $DATA_DIR/$DRUG/output/plots
mkdir -p $DATA_DIR/$DRUG/output/structure
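# a usage sketch (drug name below is illustrative):
#   mk-drug-dirs.sh pyrazinamide
# creates $DATA_DIR/pyrazinamide/input plus output/plots and output/structure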

@ -1,25 +1,36 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
#if (!require("tidyverse")) {
# install.packages("tidyverse", dependencies = TRUE)
# library(tidyverse)
#}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("ggridges")) {
install.packages("ggridges", dependencies = TRUE)
library(ggridges)
}
if (!require("plotly")) {
install.packages("plotly", dependencies = TRUE)
library(plotly)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(ggplot2)
library(cowplot)
}
if (!require("ggcorrplot")) {
@ -43,37 +54,33 @@ if (!require ("GOplot")) {
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
install.packages("data.table")
library(data.table)
}
if (!require("PerformanceAnalytics")){
@ -98,18 +105,17 @@ if (!require ("psych")){
if (!require ("dplyr")){
install.packages("dplyr")
library(psych)
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(psych)
library(compare)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(psych)
library(arsenal)
}
@ -118,7 +124,7 @@ if (!require ("arsenal")){
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
#library(ggcorrplot)
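# a minimal sketch of a helper that could replace the repeated
# install-and-load blocks above (function name is illustrative):
#install_load <- function(pkgs) {
#  for (p in pkgs) {
#    if (!require(p, character.only = TRUE)) {
#      install.packages(p, dependencies = TRUE)
#      library(p, character.only = TRUE)
#    }
#  }
#}
#install_load(c("tidyverse", "ggplot2", "ggridges", "plotly", "cowplot"))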
###for PDB files

13
scripts/DOCS Normal file
@ -0,0 +1,13 @@
dir structure
~/git/Data
aa_codes.csv
~/git/Data/<drug>/input
~/git/Data/<drug>/output
data_extraction.py
must have the dirs above, else it creates them
needs in the current dir:
reference_dict.py
tidy_split.py

178
scripts/aa_code.py Normal file
@ -0,0 +1,178 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Mon June 14 2021
@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK
# Input:
# Output:
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
import re
#import numpy as np
import argparse
DEBUG = False
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
#=======================================================================
#%%###########################################################################
# FUNCTION: using mcsm mutation format to split mutation info into
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
###############################################################################
def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):
""" Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code
@df: df containing one letter aa code for wt and mutant respectively
@type: pandas df
@wt_colname: column containing one letter wild type aa
@type: str
@mut_colname: column containing one letter mutant type aa
@type: str
@col_wt: column with 3 letter aa code lower for wild type aa
@type: str
@col_mut: column with 3 letter aa code lower for mutant type aa
@type: str
returns df: with 2 added columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
lookup_dict_aa_3lower = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict_aa_3lower[k] = v['three_letter_code_lower']
#if DEBUG:
#    print('Key:', k
#          , 'Value:', v
#          , '\n=====================================================\n'
#          , '\nDICT:', lookup_dict_aa_3lower, '\n')
df[col_wt] = df[wt_colname].map(lookup_dict_aa_3lower)
df[col_mut] = df[mut_colname].map(lookup_dict_aa_3lower)
return df
#%%
#==================================
# example: get_aa_3lower()
#==================================
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
# test_df = pd.read_csv(test_filename , sep = ',')
# my_wt_colname = 'wild_type'
# my_mut_colname = 'mutant_type'
# my_col1 = 'wt_aa_3lower'
# my_col2 = 'mut_aa_3lower'
# get_aa_3lower(df = test_df
# , wt_colname = my_wt_colname
# , mut_colname = my_mut_colname
# , col_wt = my_col1
# , col_mut = my_col2)
#%%###########################################################################
# FUNCTION: using gwas mutation format to split mutation info into
# 3 separate columns for wild type, position and mutation
###############################################################################
def get_aa_1upper(df
, gwas_mut_colname = 'mutation'
, wt_colname = 'wt_aa_1upper'
, pos_colname = 'position'
, mut_colname = 'mut_aa_1upper'):
"""Add 1 letter aa uppercase aa code for wt and mutant residues specified as 3 letter lowercase aa code
@df: df containing one letter aa code for wt and mutant respectively
@type: pandas df
@wt_regex: regex string matching three letter lowercase aa code
@type:regex
@pos_regex: regex string matching aa position
@type:regex
@mut_regex: regex string matching three letter lowercase aa code
@type: regex
@wt_colname: column containing one letter wild type aa
@type: str
@mut_colname: column containing one letter mutant type aa
@type: str
@wt_colname: column with 3 letter aa code lower for wild type aa
@type: str
@pos_colname: column with aa position
@type: int
@mut_colname: column with 3 letter aa code lower for mutant type aa
@type: str
returns df: with 3 added columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
# static regex
gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
gwas_wt = df[gwas_mut_colname].str.extract(gwas_regex)[0]
gwas_pos = df[gwas_mut_colname].str.extract(gwas_regex)[1]
gwas_mut = df[gwas_mut_colname].str.extract(gwas_regex)[2]
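# the regex matches strings like 'gid_p.ala138val' (hypothetical), capturing
# ('ala', '138', 'val'); note a single extract() call returns all three
# capture groups at once, e.g.:
# extracted = df[gwas_mut_colname].str.extract(gwas_regex)
# gwas_wt, gwas_pos, gwas_mut = extracted[0], extracted[1], extracted[2]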
lookup_dict_aa_1upper = dict()
for k, v in low_3letter_dict.items():
lookup_dict_aa_1upper[k] = v['one_letter_code']
#if DEBUG:
#    print('Key:', k
#          , 'Value:', v
#          , '\n======================================================\n'
#          , '\nDICT:', lookup_dict_aa_1upper, '\n')
# wild type
df[wt_colname] = gwas_wt.map(lookup_dict_aa_1upper)
# position
df[pos_colname] = gwas_pos
# mutant type
df[mut_colname] = gwas_mut.map(lookup_dict_aa_1upper)
return df
#%%
#==================================
# example: get_aa_1upper()
#==================================
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
# test_df2 = pd.read_csv(test_filename2 , sep = ',')
# get_aa_1upper(df = test_df2
# , gwas_mut_colname = 'mutation'
# , wt_colname = 'wild_type'
# , pos_colname = 'position'
# , mut_colname = 'mutant_type')

@ -0,0 +1,85 @@
library(bio3d)
library(seqinr)
library(bios2mds)
library(protr)
library(stringr) # for str_match() used below
#############################################################
#%% TASK
# use this to return df for AA index and mutation properties
#source() # TODO: add the path of the helper functions script to source here
##############################################################
my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
###############################################################
#%% fasta as vector
gid_aa_seq_v= read.fasta(my_fasta_file
, seqtype = "AA"
, as.string = F)
gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
#%% fasta as string
gid_aa_seq_s = read.fasta(my_fasta_file
, seqtype = "AA"
, as.string = T)
gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
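# note: gid_aa_v holds one residue per element (a character vector), which
# aa2index() below expects, while gid_aa_s is the whole sequence as a single
# string, which the protr extract*() functions at the end expect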
###############################################################
#===================
# AA indices
# https://www.genome.jp/aaindex/AAindex/list_of_indices
#===================
data(aa.index)
# default
aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
par(mfrow = c(1,1))
barplot(aai_kd)
barplot(aai_rv)
barplot(aai_rv2)
#barplot(aai_b, col = c("black", "yellow"))
##########################################################
#===================
# mutation matrices
#===================
data(sub.mat)
snps = read.csv(my_mcsmf_snps
, header = FALSE)
snps
colnames(snps) <- "mutationinformation"
# run using all matrices
sub_mat_names = as.character(unlist(attributes(sub.mat))) # names of all substitution matrices in sub.mat
#sub_mat_names = "BLOSUM80"
for (j in sub_mat_names){
print(j)
snps[[j]] <- NA
for (i in 1:nrow(snps)) {
curr_snp = snps$mutationinformation[i]
m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
aa1 = m1[,2]
aa2 = m1[,3]
#snps$blosum_80[i]
snps[[j]][i] = sub.mat[[j]][aa1,aa2]
}
}
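# an illustrative lookup, assuming "BLOSUM80" is among the matrix names:
# sub.mat[["BLOSUM80"]]["A", "V"] gives the substitution score that the loop
# above writes into snps$BLOSUM80 for a snp mutating A to V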
snps
##########################################################
gid_aac = extractAAC(gid_aa_s)
gid_dc = extractDC(gid_aa_s)
gid_tc = extractTC(gid_aa_s)
par(mfrow = c(1, 3))
barplot(gid_aac)
barplot(gid_dc)
barplot(gid_tc)
###########################################################

Some files were not shown because too many files have changed in this diff.