finalised categorical and lineage col classifications

2022-05-29 05:22:01 +01:00 · 2022-05-29 05:22:01 +01:00 · 084c280f16
commit 084c280f16
parent c37780350e
2 changed files with 94 additions and 61 deletions
--- a/UQ_TODO_categorical_classification_columns.py
+++ b/UQ_TODO_categorical_classification_columns.py
@ -6,64 +6,104 @@ Created on Wed May 25 02:01:19 2022
@author: tanu
 """
 # TODO
-categorical_cols = ['ss_class', 'wt_prop_water', 'mut_prop_water', 'wt_prop_polarity',
-       'mut_prop_polarity', 'wt_calcprop', 'mut_calcprop']
+# categorical_cols = ['ss_class'
+#                     , 'wt_prop_water'
+#                     , 'mut_prop_water'
+#                     , 'wt_prop_polarity'
+#                     , 'mut_prop_polarity'
+#                     , 'wt_calcprop'
+#                     , 'mut_calcprop']

-foo['water_prop_change'] = foo['wt_prop_water'] + str('_to_') + foo['mut_prop_water']
-foo['water_prop_change'].value_counts()
+my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water']
+my_df['water_change'].value_counts()
 water_prop_changeD = {
-    'hydrophobic_to_neutral'          : ''
+    'hydrophobic_to_neutral'          : 'change'
    , 'hydrophobic_to_hydrophobic'    : 'no_change'
    , 'neutral_to_neutral'            : 'no_change'
-    , 'neutral_to_hydrophobic'        : ''
-    , 'hydrophobic_to_hydrophilic'    : ''
-    , 'neutral_to_hydrophilic'        : ''
-    , 'hydrophilic_to_neutral'        : ''
-    , 'hydrophilic_to_hydrophobic'    : ''
+    , 'neutral_to_hydrophobic'        : 'change'
+    , 'hydrophobic_to_hydrophilic'    : 'change'
+    , 'neutral_to_hydrophilic'        : 'change'
+    , 'hydrophilic_to_neutral'        : 'change'
+    , 'hydrophilic_to_hydrophobic'    : 'change'
    , 'hydrophilic_to_hydrophilic'    : 'no_change'
 }

-foo['polarity_prop_change'] = foo['wt_prop_polarity'] + str('_to_') + foo['mut_prop_polarity']
-foo['polarity_prop_change'].value_counts()
+my_df['water_change'] = my_df['water_change'].map(water_prop_changeD)
+my_df['water_change'].value_counts()
+
+#%%
+my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity']
+my_df['polarity_change'].value_counts()
 # add a no change category

 polarity_prop_changeD = {
    'non-polar_to_non-polar'     : 'no_change'
-    , 'non-polar_to_neutral'     : ''  
-    , 'neutral_to_non-polar'     : ''  
-    , 'neutral_to_neutral'       : ''  
-    , 'non-polar_to_basic'       : ''  
-    , 'acidic_to_neutral'        : ''  
-    , 'basic_to_neutral'         : ''  
-    , 'non-polar_to_acidic'      : ''  
-    , 'neutral_to_basic'         : ''  
-    , 'acidic_to_non-polar'      : ''  
-    , 'basic_to_non-polar'       : ''
-    , 'neutral_to_acidic'        : ''
+    , 'non-polar_to_neutral'     : 'change'  
+    , 'neutral_to_non-polar'     : 'change'  
+    , 'neutral_to_neutral'       : 'no_change'  
+    , 'non-polar_to_basic'       : 'change'  
+    , 'acidic_to_neutral'        : 'change'  
+    , 'basic_to_neutral'         : 'change'  
+    , 'non-polar_to_acidic'      : 'change'  
+    , 'neutral_to_basic'         : 'change'  
+    , 'acidic_to_non-polar'      : 'change'  
+    , 'basic_to_non-polar'       : 'change'
+    , 'neutral_to_acidic'        : 'change'
    , 'acidic_to_acidic'         : 'no_change'
-    , 'basic_to_acidic'          : ''
+    , 'basic_to_acidic'          : 'change'
    , 'basic_to_basic'           : 'no_change'
-    , 'acidic_to_basic'          : ''}
+    , 'acidic_to_basic'          : 'change'}

+my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD)
+my_df['polarity_change'].value_counts()

-foo['calc_prop_change'] = foo['wt_calcprop'] + str('_to_') + foo['mut_calcprop']
-foo['calc_prop_change'].value_counts()
+#%%
+my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop']
+my_df['electrostatics_change'].value_counts()

 calc_prop_changeD = {
        'non-polar_to_non-polar'     : 'no_change'
-        , 'non-polar_to_polar'       : ''
-        , 'polar_to_non-polar'       : ''
-        , 'non-polar_to_pos'         : ''
-        , 'neg_to_non-polar'         : ''
-        , 'non-polar_to_neg'         : ''
-        , 'pos_to_polar'             : ''
-        , 'pos_to_non-polar'         : ''
+        , 'non-polar_to_polar'       : 'change'
+        , 'polar_to_non-polar'       : 'change'
+        , 'non-polar_to_pos'         : 'change'
+        , 'neg_to_non-polar'         : 'change'
+        , 'non-polar_to_neg'         : 'change'
+        , 'pos_to_polar'             : 'change'
+        , 'pos_to_non-polar'         : 'change'
        , 'polar_to_polar'           : 'no_change'
        , 'neg_to_neg'               : 'no_change'
-        , 'polar_to_neg'             : ''
-        , 'pos_to_neg'               : ''
-        , 'pos_to_pos'               : ''
-        , 'polar_to_pos'             : ''
-        , 'neg_to_polar'             : ''
-        , 'neg_to_pos'               : ''
+        , 'polar_to_neg'             : 'change'
+        , 'pos_to_neg'               : 'change'
+        , 'pos_to_pos'               : 'no_change'
+        , 'polar_to_pos'             : 'change'
+        , 'neg_to_polar'             : 'change'
+        , 'neg_to_pos'               : 'change'
 }
+
+my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD)
+my_df['electrostatics_change'].value_counts()
+
+#%%
+#https://stackoverflow.com/questions/47181187/finding-string-over-multiple-columns-in-pandas
+detect_change = 'change'
+
+# if detect_change in my_df['water_change'] | my_df['polarity_change'] | my_df['electrostatics_change']:
+#     print('\nChange detected')
+
+check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change', 'electrostatics_change']
+check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change']
+foo = my_df[check]
+
+foo['new'] = (foo.values == detect_change).any(1).astype(int)
+#foo['new2'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose match ('change' is a substring of 'no_change'), so always 1
+foo['new3'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int)
+
+all(foo['new'] == foo['new3'])
+#%%lineage
+lineage_colnames = ['lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode']
+bar = my_df[lineage_colnames]
+
+
+tot_lineage_u = 8
+bar['lineage_proportion'] = bar['lineage_count_unique']/bar['lineage_count_all']
+bar['dist_lineage_proportion'] = bar['lineage_count_unique']/tot_lineage_u
--- a/pnca_config.py
+++ b/pnca_config.py
@ -5,29 +5,22 @@ Created on Sat May 28 05:25:30 2022

@author: tanu
 """
-import os, sys

-def MyGlobalVars():
-    global gene
-    global drug 
-    global homedir
-    gene  = 'pncA'
-    drug  = 'pyrazinamide'
-    homedir = os.path.expanduser("~")
+import os

-MyGlobalVars()
+gene  = 'pncA'
+drug  = 'pyrazinamide'
+total_mtblineage_u = 8

-os.chdir(homedir + "/git/ML_AI_training/")

-# my function
+homedir = os.path.expanduser("~")
+os.chdir( homedir + '/git/ML_AI_training/')
+
+from UQ_ML_data import *
+setvars(gene,drug)
+from UQ_ML_data import *
+
+# from YC run_all_ML: run locally
 from UQ_MultModelsCl import MultModelsCl
-from UQ_pnca_ML.py import *
-# from YC run_all_ML
-
-# YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed')
-
-# CVResultsDF = YC_resD2['CrossValResultsDF']
-# CVResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)
-# BTSResultsDF = YC_resD2['BlindTestResultsDF']
-# BTSResultsDF.sort_values(by=['matthew'], ascending=False, inplace=True)

+print('TESTING cmd:', Counter(y))