#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022

@author: tanu

Fetch the ML input data for six gene/drug pairs via getmldata() and stack
them into one data frame restricted to the columns shared by all of them.

Module-level results:
    combined_df        : all genes, common columns only
    cm_input_df5       : combined_df without the rows whose gene_name is 'alr'
    combined_df_actual : combined_df without the rows whose 'dst' is missing
"""
import sys
import os
import pandas as pd
import numpy as np
import re
###############################################################################
homedir = os.path.expanduser("~")
# Make the project's ML helper modules importable (GetMLData lives there).
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
#====================
# Import ML functions
#====================
# from MultClfs import *
# NOTE(review): star import kept as-is; only getmldata() is used in this file,
# but other scripts may rely on the names it re-exports -- confirm before
# narrowing it.
from GetMLData import *
#from SplitTTS import *

#%% Load all gene files #######################################################
# param dict: the same keyword arguments are passed to every getmldata() call
combined_model_paramD = {
    'data_combined_model': True,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

pnca_df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
embb_df = getmldata('embB', 'ethambutol', **combined_model_paramD)
katg_df = getmldata('katG', 'isoniazid', **combined_model_paramD)
rpob_df = getmldata('rpoB', 'rifampicin', **combined_model_paramD)
gid_df = getmldata('gid', 'streptomycin', **combined_model_paramD)
alr_df = getmldata('alr', 'cycloserine', **combined_model_paramD)

# quick check: concatenate two genes and inspect the affinity/distance columns
foo = pd.concat([alr_df, pnca_df])
check1 = foo.filter(regex= '.*_affinity|gene_name|ligand_distance', axis = 1)

# So, pd.concat will join correctly but introduce NAs.
# TODO: discuss whether to make these 0 and use it or just omit
# For now I am omitting these i.e combining only on common columns
dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]
expected_nrows = sum(len(df) for df in dfs_combine)

# finding common columns
# A bare set has no stable iteration order for strings across interpreter
# runs, so the shared columns are listed in the order they appear in the
# first data frame; this keeps the column order of combined_df reproducible.
shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine))
common_cols = list(dict.fromkeys(col for col in dfs_combine[0].columns
                                 if col in shared_cols))

# The common set can never be larger than the narrowest data frame; equality
# means every column of that narrowest frame is present in all the others.
expected_ncols = min(len(df.columns) for df in dfs_combine)

if len(common_cols) != expected_ncols:
    # Stop here: nothing below can run without combined_df.
    raise ValueError('\nFAIL: could not combine dfs, length mismatch'
                     + '\nExpected ncols: ' + str(expected_ncols)
                     + '\nGot: ' + str(len(common_cols)))

print('\nProceeding to combine based on common cols (n):', len(common_cols))
# ignore_index=False: each gene's original row index is kept in combined_df.
combined_df = pd.concat([df[common_cols] for df in dfs_combine],
                        ignore_index = False)

if len(combined_df) != expected_nrows:
    raise ValueError('\nFAIL: combined df has unexpected number of rows'
                     + '\nExpected nrows: ' + str(expected_nrows)
                     + '\nGot: ' + str(len(combined_df)))

print('\nSuccessfully combined dfs:'
      , '\nNo. of dfs combined:', len(dfs_combine)
      , '\nDim of combined df:', combined_df.shape)

colnames_combined_df = combined_df.columns
if 'gene_name' in colnames_combined_df:
    print("\nGene name included")
else:
    print('\nGene name NOT included')

# Subset without alr (the filter below requires the 'gene_name' column).
omit_gene_alr = ['alr']
cm_input_df5 = combined_df[~combined_df['gene_name'].isin(omit_gene_alr)]

#%% COMPLETE data
# Report how many rows lack / have a 'dst' value, then keep only those with one.
print('\nNo. of rows with missing dst:', combined_df['dst'].isna().sum()
      , '\nNo. of rows with dst:', combined_df['dst'].value_counts().sum())

combined_df_actual = combined_df[~combined_df['dst'].isna()]
##############################################################################