ML_AI_training/earlier_versions/my_data_target_counts.py

81 lines
2 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 3 17:08:18 2022
@author: tanu
"""
#%% load packages
import sys, os
import pandas as pd
from pandas import DataFrame
import numpy as np
import argparse
from functools import reduce
#%% locate the data folders and read the merged dataset
# NOTE(review): `gene` and `drug` are not assigned anywhere in the visible
# part of this script (the placeholders just below are commented out), so
# both must already exist in the namespace when this cell runs; otherwise the
# first use of `drug` raises NameError. TODO confirm how they are supplied.
#gene = ''
#drug = ''

homedir = os.path.expanduser("~")

# Work from the test-data folder of the training repository.
os.chdir(homedir + "/git/ML_AI_training/test_data")

#==============
# directories
#==============
# Each drug has its own folder under the shared Data directory, holding an
# 'input' and an 'output' sub-folder.
datadir = homedir + '/git/Data/'
_drug_dir = datadir + drug
indir = _drug_dir + '/input/'
outdir = _drug_dir + '/output/'

# gene_baiscL = ['pnca']
# geneL_naL = ['gid', 'rpob']
# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']

#=======
# input
#=======
# The merged file is read from the drug's *output* folder and is named after
# the lower-cased gene.
_gene_prefix = gene.lower()
infile_ml1 = outdir + _gene_prefix + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1)

# Quick look at the column types (only displayed when run interactively),
# and keep the column index at hand.
my_df.dtypes
my_df_cols = my_df.columns
#%%============================================================================
# GET Y
# Derive a human-readable label column from the drug column, then count the
# values of each candidate target. Relies on `my_df` and `drug` already being
# defined by the time this cell runs.
# The bare expressions below only display something when run interactively.

# Name of the derived label column: '<drug>_labels'.
drug_labels = drug + '_labels'
drug_labels

# Step 1: seed the label column with the raw drug values and inspect them.
my_df[drug_labels] = my_df[drug]
my_df[drug_labels].value_counts()

# Step 2: recode 1/0 as text; values with no entry in the mapping become NaN.
_label_for_code = {0: 'sensitive', 1: 'resistant'}
my_df[drug_labels] = my_df[drug].map(_label_for_code)
my_df[drug_labels].value_counts()

# Step 3: give the NaN entries an explicit 'unknown' label so that
# value_counts(), which skips NaN by default, reports them as well.
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()

# Non-null count of the mutation column (a one-entry Series keyed by the
# column name, because a list of columns is selected).
mutC = my_df[['mutationinformation']].count()

# Frequency of each value in the three candidate target columns.
target1C = my_df['mutation_info_labels'].value_counts()
target2C = my_df[drug_labels].value_counts()
#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'})
target3C = my_df['drtype'].value_counts()

# Stack all the counts end to end into a single Series.
targetsC = pd.concat([mutC, target1C, target2C, target3C], axis=0)
targetsC
# targetsC2 = pd.concat([mutC, target1C, target2C
# #, target3C
# ], axis = 1)
# targetsC2
#%% try combinations
# X_vars = X_stability
# X_vars = X_evol
# X_vars = X_str
# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
# X_vars = pd.concat([X_stability, X_str], axis = 1)
# X_vars = pd.concat([X_evol, X_str], axis = 1)