Working on the dissected model: testing different feature groups

This commit is contained in:
Tanushree Tunstall 2022-06-20 21:51:07 +01:00
parent 135efcee41
commit e68a153883
4 changed files with 270 additions and 161 deletions

View file

@ -47,60 +47,78 @@ outdir_ml = outdir + 'ml/uq_v1/dissected'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
, '\nThese are:'
, '\nCommon stablity features:', X_stabilityN
, '\nFoldX columns:', X_foldX_cols
, '\nOther struc columns:', X_str
, '\n================================================================\n')
, '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
, '\n\nTotal no. of stability features:' , len(X_stability_FN)
, '\n--------Common stabilty cols:' , len(X_common_stability_Fnum)
, '\n--------Foldx cols:' , len(X_foldX_Fnum)
, '\n\nTotal no. of affinity features:' , len(X_affinityFN)
, '\n--------Common affinity cols:' , len(common_affinity_Fnum)
, '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames)
, '\n\nTotal no. of residue level features:', len(X_resprop_FN)
, '\n--------AA index cols:' , len(X_aaindex_Fnum)
, '\n--------Residue Prop cols:' , len(X_str_Fnum)
, '\n--------AA change Prop cols:' , len(X_aap_Fcat)
, '\n\nTotal no. of genomic features:' , len(X_genomicFN)
, '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
, '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
, '\n--------Other cols:' , len(X_gn_Fcat)
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
###############################################################################
print('Evolutionary features (n):'
, len(X_evolFN)
, '\nThese are:\n'
, X_evolFN
, '\n================================================================\n')
print('\n================================================================'
, '\nTotal Evolutionary features (n):' , len(X_evolFN)
, '\n--------------Evol. feature colnames:', X_evolFN
, '\n================================================================'
, '\n\nTotal structural features (n):', len(X_structural_FN)
, '\n--------Stability ncols:' , len(X_stability_FN)
, '\n--------------Common stability colnames:' , X_common_stability_Fnum
, '\n--------------Foldx colnames:' , X_foldX_Fnum
, '\n--------Affinity ncols:' , len(X_affinityFN)
, '\n--------------Common affinity colnames:' , common_affinity_Fnum
, '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
print('Genomic features (n):'
, len(X_genomicFN)
, '\nThese are:\n'
, X_genomic_mafor, '\n'
, X_genomic_linegae
, '\n================================================================\n')
, '\n--------Residue prop ncols:' , len(X_resprop_FN)
, '\n--------------Residue Prop cols:' , X_str_Fnum
, '\n--------------AA change Prop cols:' , X_aap_Fcat
, '\n--------------AA index cols:' , X_aaindex_Fnum
, '\n================================================================'
, '\n\nTotal Genomic features (n):' , len(X_genomicFN)
, '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
, '\n--------------MAF+OR colnames:' , X_gn_mafor_Fnum
print('Categorical features (n):'
, len(categorical_FN)
, '\nThese are:\n'
, categorical_FN
, '\n================================================================\n')
, '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
, '\n--------------Lineage cols:' , X_gn_linegae_Fnum
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
, '\n--------Other cols:' , len(X_gn_Fcat)
, '\n--------------Other cols:' , X_gn_Fcat
, '\n================================================================')
# Sanity check
if ( len(X.columns) == len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\nFail: Count of feature mismatch'
, '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
, '\nGot:', len(X.columns))
sys.exit()
print('\n#####################################################################\n')
@ -108,7 +126,7 @@ print('\n#####################################################################\n
# #==================
# # Baseline models
# #==================
# mm_skf_scoresD = MultModelsCl(input_df = X
# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
# , target = y
# , var_type = 'mixed'
# , skf_cv = skf_cv