From f94eadf1d4f9d728aea2136b81e831e5a7d06442 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 1 Sep 2022 12:54:41 +0100 Subject: [PATCH] added --- scripts/count_vars_ML.R | 4 +-- .../{GetMLData_v2.py => GetMLData_v1.py} | 35 +++++++++++++------ scripts/ml/ml_functions/TEST_GetMLData.py | 14 ++++++++ 3 files changed, 40 insertions(+), 13 deletions(-) rename scripts/ml/ml_functions/{GetMLData_v2.py => GetMLData_v1.py} (95%) create mode 100644 scripts/ml/ml_functions/TEST_GetMLData.py diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 059cbc6..464fd6d 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -1,11 +1,11 @@ # count numbers for ML -#source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/pnca.R") -source("~/git/LSHTM_analysis/config/rpob.R") +#source("~/git/LSHTM_analysis/config/rpob.R") ############################# # GET the actual merged dfs diff --git a/scripts/ml/ml_functions/GetMLData_v2.py b/scripts/ml/ml_functions/GetMLData_v1.py similarity index 95% rename from scripts/ml/ml_functions/GetMLData_v2.py rename to scripts/ml/ml_functions/GetMLData_v1.py index 180d586..d5eab71 100755 --- a/scripts/ml/ml_functions/GetMLData_v2.py +++ b/scripts/ml/ml_functions/GetMLData_v1.py @@ -450,15 +450,15 @@ def getmldata(gene, drug if gene.lower() in geneL_na: - gene_affinity_colnames = ['mcsm_na_affinity'] + gene_affinity_colnames = ['mcsm_na_affinity', 'nca_distance'] #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] cols_to_mask_ppi2 = [] cols_to_mask_na = ['mcsm_na_affinity'] - + # both if gene.lower() in geneL_na_ppi2: - gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + gene_affinity_colnames = ['mcsm_na_affinity','nca_distance', 'mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] @@ -481,27 +481,40 @@ def getmldata(gene, drug # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + mask_check_cols = ['mutationinformation', 'ligand_distance'] + cols_to_mask + #--------------------------- # mask the mcsm_ppi2_affinity column where interface_dist > 10 #--------------------------- if len(cols_to_mask_ppi2) > 0: my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0 add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2 - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] - else: - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] - + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + mask_check_cols = mask_check_cols + add_cols_mask + #--------------------------- # mask the na_affinity column where nca_distance > 10 #--------------------------- if len(cols_to_mask_na) > 0: my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0 add_cols_mask = ['nca_distance'] + cols_to_mask_na - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] - else: - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] - + #mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + mask_check_cols = mask_check_cols + add_cols_mask + + # if gene.lower() in geneL_na_ppi2: + # #--------------------------- + # # RPOB: mask ppi2 + na + lig cols + # #--------------------------- + # mask_check = my_df_ml[['mutationinformation', + # 'ligand_distance', 'ligand_affinity_change', + # 'nca_distance','mcsm_na_affinity', + # 'mcsm_ppi2_affinity','interface_dist']] + + # GET mask data + mask_check = my_df_ml[mask_check_cols] + # sanity check: check script SANITY_CHECK_mask.py if write_maskfile: # write mask file for sanity check diff --git a/scripts/ml/ml_functions/TEST_GetMLData.py b/scripts/ml/ml_functions/TEST_GetMLData.py new file mode 100644 index 0000000..8bf23c5 --- /dev/null +++ b/scripts/ml/ml_functions/TEST_GetMLData.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Sep 1 12:22:27 2022 + +@author: tanu +""" +getmldata(gene = "katG" + , drug = "isoniazid" + , data_combined_model = False + , use_or = False + , omit_all_genomic_features = False + , write_maskfile = True + , write_outfile = False) \ No newline at end of file