diff --git a/scripts/ml/ml_functions/GetMLData.py b/scripts/ml/ml_functions/GetMLData.py index 3410389..bdbd70e 100755 --- a/scripts/ml/ml_functions/GetMLData.py +++ b/scripts/ml/ml_functions/GetMLData.py @@ -101,14 +101,15 @@ def getmldata(gene, drug datadir = homedir + '/git/Data/' indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' - outdir_ml = outdir + 'ml/' - + #outdir_ml = outdir + 'ml/' + outdir_ml = homedir + '/git/LSHTM_ML/output/' + #========================== # outfile for ML training: #========================== - outFile_ml = outdir_ml + gene.lower() + '_training_data.csv' + outFile_ml = outdir_ml + gene.lower() + '_training_data.csv' - outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv' + outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv' #======= # input @@ -436,41 +437,58 @@ def getmldata(gene, drug #X_stabilityN = common_cols_stabiltyN gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] - + cols_to_mask_ppi2 = [] + if gene.lower() in geneL_ppi2: gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols - cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] + #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] + cols_to_mask = ['ligand_affinity_change'] + cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] + if gene.lower() in geneL_na: gene_affinity_colnames = ['mcsm_na_affinity'] #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + cols_to_mask_ppi2 = [] + if gene.lower() in geneL_na_ppi2: gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols - cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] + #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] #======================= - # Masking columns: - # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 + # Masking columns: + # lig_dist >10 ==> mCSM-lig AND mCSM-NA col values == 0 + # interface_dist >10 ==> mCSM-ppi2 col values == 0 #======================= my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() - # mask the mcsm affinity related columns where ligand distance > 10 + # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 - (my_df_ml['ligand_affinity_change'] == 0).sum() + + # mask the mcsm_ppi2_affinity column where interface_dist > 10 + if len(cols_to_mask_ppi2) > 0: + my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0 + add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2 + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + else: + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] - mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + # sanity check: check script SANITY_CHECK_mask.py - #=================================================== - # write file for check - #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - #mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') - #=================================================== + if write_maskfile: + # write mask file for sanity check + #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) + + mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv') + ############################################################################### #%% Feature groups (FG): Build X for Input ML ############################################################################ diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index c4a5be1..dafe756 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -77,6 +77,7 @@ import re import itertools from sklearn.model_selection import LeaveOneGroupOut from sklearn.decomposition import PCA +from sklearn.naive_bayes import ComplementNB #%% GLOBALS #rs = {'random_state': 42} @@ -260,6 +261,8 @@ def MultModelsCl(input_df, target #====================================================== models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) + #, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION + , ('Complement NB' , ComplementNB() ) , ('Decision Tree' , DecisionTreeClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) ) @@ -271,8 +274,8 @@ def MultModelsCl(input_df, target , ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - , ('Multinomial' , MultinomialNB() ) - , ('Naive Bayes' , BernoulliNB() ) + , ('Multinomial NB' , MultinomialNB() ) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) , ('QDA' , QuadraticDiscriminantAnalysis() ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index a132b89..707b188 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -18,7 +18,6 @@ from SplitTTS import * from MultClfs_SIMPLE import * #%% - skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs) #sel_cv = logo @@ -29,16 +28,16 @@ skf_cv = StratifiedKFold(n_splits = 10 gene_model_paramD = {'data_combined_model' : False , 'use_or' : False , 'omit_all_genomic_features': False - , 'write_maskfile' : False + , 'write_maskfile' : True , 'write_outfile' : False } #df = getmldata(gene, drug, **gene_model_paramD) df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD) -df = getmldata('embB', 'ethambutol' , **gene_model_paramD) -df = getmldata('katG', 'isoniazid' , **gene_model_paramD) -df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD) -df = getmldata('gid' , 'streptomycin' , **gene_model_paramD) -#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD) +#df = getmldata('embB', 'ethambutol' , **gene_model_paramD) +#df = getmldata('katG', 'isoniazid' , **gene_model_paramD) +#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD) +#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD) +#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD) all(df.columns.isin(['gene_name'])) # should be False spl_type = '70_30'