# Patch overview: git diff touching two Python scripts. The line breaks inside
# the hunks appear to have been collapsed into spaces, so the hunk line counts
# cannot be checked from this text.
#
# 1. UQ_TODO_categorical_classification_columns.py
#    - Comments out the `categorical_cols` list.
#    - Switches the working DataFrame name from `foo` to `my_df` and names the
#      derived columns `water_change`, `polarity_change`, `electrostatics_change`.
#    - Fills every empty ('') value in water_prop_changeD, polarity_prop_changeD
#      and calc_prop_changeD: keys whose two halves are identical map to
#      'no_change', every other key maps to 'change'.
#    - Applies each dict with .map(), so the 'WT_to_MUT' strings built by
#      concatenation are replaced by 'change' / 'no_change'.
#    - Adds 0/1 flags: `new` is 1 when any column listed in `check` equals
#      'change' exactly, `new3` does the same over `check_prop_cols` only; the
#      two are then compared with all().
#    - Adds lineage ratios: lineage_count_unique / lineage_count_all and
#      lineage_count_unique / tot_lineage_u (8).
#    NOTE(review): `foo = my_df[check]` and `bar = my_df[lineage_colnames]` are
#      assigned new columns right afterwards; pandas may emit
#      SettingWithCopyWarning here - consider taking `.copy()` first.
#    NOTE(review): a combination missing from a mapping dict would presumably
#      become NaN after .map() - verify against the value_counts() output.
#
# 2. pnca_config.py (mode 100644 -> 100755)
#    - Replaces the MyGlobalVars() function and its `global` declarations with
#      module-level `gene`, `drug`, `total_mtblineage_u`; drops the `sys` import.
#    - Still sets `homedir` and chdir()s into <home>/git/ML_AI_training/.
#    - Star-imports UQ_ML_data, calls setvars(gene, drug), then star-imports
#      UQ_ML_data a second time (presumably to pick up names that setvars()
#      creates - TODO confirm).
#    - Removes `from UQ_pnca_ML.py import *` and the commented-out run_all_ML
#      example; adds a print of Counter(y).
#    NOTE(review): `setvars`, `Counter` and `y` are not imported explicitly in the
#      visible lines; presumably they come from the star import - confirm.
#    NOTE(review): `tot_lineage_u = 8` (file 1) and `total_mtblineage_u = 8`
#      (file 2) look like the same constant defined twice - confirm.
diff --git a/UQ_TODO_categorical_classification_columns.py b/UQ_TODO_categorical_classification_columns.py index f13f8fb..d8c9ebf 100644 --- a/UQ_TODO_categorical_classification_columns.py +++ b/UQ_TODO_categorical_classification_columns.py @@ -6,64 +6,104 @@ Created on Wed May 25 02:01:19 2022 @author: tanu """ # TODO -categorical_cols = ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity', - 'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop'] +# categorical_cols = ['ss_class' +# , 'wt_prop_water' +# , 'mut_prop_water' +# , 'wt_prop_polarity' +# , 'mut_prop_polarity' +# , 'wt_calcprop' +# , 'mut_calcprop'] -foo['water_prop_change'] = foo['wt_prop_water'] + str('_to_') + foo['mut_prop_water'] -foo['water_prop_change'].value_counts() +my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water'] +my_df['water_change'].value_counts() water_prop_changeD = { - 'hydrophobic_to_neutral' : '' + 'hydrophobic_to_neutral' : 'change' , 'hydrophobic_to_hydrophobic' : 'no_change' , 'neutral_to_neutral' : 'no_change' - , 'neutral_to_hydrophobic' : '' - , 'hydrophobic_to_hydrophilic' : '' - , 'neutral_to_hydrophilic' : '' - , 'hydrophilic_to_neutral' : '' - , 'hydrophilic_to_hydrophobic' : '' + , 'neutral_to_hydrophobic' : 'change' + , 'hydrophobic_to_hydrophilic' : 'change' + , 'neutral_to_hydrophilic' : 'change' + , 'hydrophilic_to_neutral' : 'change' + , 'hydrophilic_to_hydrophobic' : 'change' , 'hydrophilic_to_hydrophilic' : 'no_change' } -foo['polarity_prop_change'] = foo['wt_prop_polarity'] + str('_to_') + foo['mut_prop_polarity'] -foo['polarity_prop_change'].value_counts() +my_df['water_change'] = my_df['water_change'].map(water_prop_changeD) +my_df['water_change'].value_counts() + +#%% +my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity'] +my_df['polarity_change'].value_counts() # add a no change category polarity_prop_changeD = { 'non-polar_to_non-polar' : 'no_change' - , 
'non-polar_to_neutral' : '' - , 'neutral_to_non-polar' : '' - , 'neutral_to_neutral' : '' - , 'non-polar_to_basic' : '' - , 'acidic_to_neutral' : '' - , 'basic_to_neutral' : '' - , 'non-polar_to_acidic' : '' - , 'neutral_to_basic' : '' - , 'acidic_to_non-polar' : '' - , 'basic_to_non-polar' : '' - , 'neutral_to_acidic' : '' + , 'non-polar_to_neutral' : 'change' + , 'neutral_to_non-polar' : 'change' + , 'neutral_to_neutral' : 'no_change' + , 'non-polar_to_basic' : 'change' + , 'acidic_to_neutral' : 'change' + , 'basic_to_neutral' : 'change' + , 'non-polar_to_acidic' : 'change' + , 'neutral_to_basic' : 'change' + , 'acidic_to_non-polar' : 'change' + , 'basic_to_non-polar' : 'change' + , 'neutral_to_acidic' : 'change' , 'acidic_to_acidic' : 'no_change' - , 'basic_to_acidic' : '' + , 'basic_to_acidic' : 'change' , 'basic_to_basic' : 'no_change' - , 'acidic_to_basic' : ''} + , 'acidic_to_basic' : 'change'} +my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD) +my_df['polarity_change'].value_counts() -foo['calc_prop_change'] = foo['wt_calcprop'] + str('_to_') + foo['mut_calcprop'] -foo['calc_prop_change'].value_counts() +#%% +my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop'] +my_df['electrostatics_change'].value_counts() calc_prop_changeD = { 'non-polar_to_non-polar' : 'no_change' - , 'non-polar_to_polar' : '' - , 'polar_to_non-polar' : '' - , 'non-polar_to_pos' : '' - , 'neg_to_non-polar' : '' - , 'non-polar_to_neg' : '' - , 'pos_to_polar' : '' - , 'pos_to_non-polar' : '' + , 'non-polar_to_polar' : 'change' + , 'polar_to_non-polar' : 'change' + , 'non-polar_to_pos' : 'change' + , 'neg_to_non-polar' : 'change' + , 'non-polar_to_neg' : 'change' + , 'pos_to_polar' : 'change' + , 'pos_to_non-polar' : 'change' , 'polar_to_polar' : 'no_change' , 'neg_to_neg' : 'no_change' - , 'polar_to_neg' : '' - , 'pos_to_neg' : '' - , 'pos_to_pos' : '' - , 'polar_to_pos' : '' - , 'neg_to_polar' : '' - , 'neg_to_pos' : '' + 
, 'polar_to_neg' : 'change' + , 'pos_to_neg' : 'change' + , 'pos_to_pos' : 'no_change' + , 'polar_to_pos' : 'change' + , 'neg_to_polar' : 'change' + , 'neg_to_pos' : 'change' } + +my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD) +my_df['electrostatics_change'].value_counts() + +#%% +#https://stackoverflow.com/questions/47181187/finding-string-over-multiple-columns-in-pandas +detect_change = 'change' + +# if detect_change in my_df['water_change'] | my_df['polarity_change'] | my_df['electrostatics_change']: +# print('\nChange detected') + +check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change', 'electrostatics_change'] +check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change'] +foo = my_df[check] + +foo['new'] = (foo.values == detect_change).any(1).astype(int) +#foo['new2'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose match so always 1 +foo['new3'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int) + +all(foo['new'] == foo['new3']) +#%%lineage +lineage_colnames = ['lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] +bar = my_df[lineage_colnames] + + +tot_lineage_u = 8 +bar['lineage_proportion'] = bar['lineage_count_unique']/bar['lineage_count_all'] +bar['dist_lineage_proportion'] = bar['lineage_count_unique']/tot_lineage_u \ No newline at end of file diff --git a/pnca_config.py b/pnca_config.py old mode 100644 new mode 100755 index 51b747c..fe0b35a --- a/pnca_config.py +++ b/pnca_config.py @@ -5,29 +5,22 @@ Created on Sat May 28 05:25:30 2022 @author: tanu """ -import os, sys -def MyGlobalVars(): - global gene - global drug - global homedir - gene = 'pncA' - drug = 'pyrazinamide' - homedir = os.path.expanduser("~") - -MyGlobalVars() +import os -os.chdir(homedir + "/git/ML_AI_training/") +gene = 'pncA' +drug = 'pyrazinamide' +total_mtblineage_u = 8 -# my 
function + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally from UQ_MultModelsCl import MultModelsCl -from UQ_pnca_ML.py import * -# from YC run_all_ML - -# YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') - -# CVResultsDF = YC_resD2['CrossValResultsDF'] -# CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) -# BTSResultsDF = YC_resD2['BlindTestResultsDF'] -# BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True) +print('TESTING cmd:', Counter(y)) \ No newline at end of file