horrible lineage analysis hell

2022-06-28 21:51:02 +01:00 · 2022-06-28 21:51:02 +01:00 · 478df927cc
commit 478df927cc
parent ce0f12382e
10 changed files with 1669 additions and 101 deletions
--- a/scripts/ml/combined_model/ml_data_combined
+++ b/scripts/ml/combined_model/ml_data_combined
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 25 11:07:30 2022
+
+@author: tanu
+"""
+
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/functions')
+###############################################################################
+#====================
+# Import ML functions 
+#====================
+#from MultClfs import *
+from GetMLData import *
+from SplitTTS import *
+#%% Load all gene files #######################################################
+# Keyword settings handed unchanged to every getmldata() call below
+combined_model_paramD = dict(data_combined_model         = True
+                             , use_or                    = False
+                             , omit_all_genomic_features = False
+                             , write_maskfile            = False
+                             , write_outfile             = False)
+
+# One df per (gene, drug) pair, fetched in the order the names are listed
+pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df = (
+    getmldata(gene, drug, **combined_model_paramD)
+    for gene, drug in [('pncA', 'pyrazinamide'), ('embB', 'ethambutol')
+                       , ('katG', 'isoniazid'), ('rpoB', 'rifampicin')
+                       , ('gid', 'streptomycin'), ('alr', 'cycloserine')])
+
+# Quick check: concatenating two gene dfs directly joins the columns by name
+# but introduces NAs in the columns that either df lacks.
+foo = pd.concat([alr_df, pnca_df])
+check1 = foo.filter(regex= '.*_affinity|gene_name|ligand_distance', axis = 1)
+# TODO: discuss whether to make these 0 and use it or just omit
+# For now I am omitting these i.e combining only on common columns
+dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]
+expected_nrows = sum(len(df) for df in dfs_combine)
+expected_ncols = min(len(df.columns) for df in dfs_combine)
+
+# Take the shared columns in pnca_df order: list(set) of column names has no
+# stable order, so the feature order would otherwise differ between runs.
+shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine))
+common_cols = [col for col in dict.fromkeys(pnca_df.columns) if col in shared_cols]
+
+if len(common_cols) != expected_ncols:
+    # Stop here: carrying on would only fail later with combined_df undefined
+    sys.exit('\nFAIL: could not combine dfs, length mismatch'
+             f'\nExpected ncols: {expected_ncols}'
+             f'\nGot: {len(common_cols)}')
+print('\nProceeding to combine based on common cols (n):', len(common_cols))
+combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)
+print('\nSuccessfully combined dfs:'
+      , '\nNo. of dfs combined:', len(dfs_combine)
+      , '\nDim of combined df:', combined_df.shape)
+#%% split data into different data types
+# Settings for split_tts: 'actual' data, '70_30' split, oversampling on
+tts_7030_paramD = dict(data_type      = 'actual'
+                       , split_type   = '70_30'
+                       , oversampling = True)
+
+# include_gene_name is False when not doing leave one group out
+data_CM_7030D = split_tts(ml_input_data = combined_df, **tts_7030_paramD
+                          , dst_colname = 'dst', target_colname = 'dst_mode'
+                          , include_gene_name = False)