thorough checking and updates for final running of all gene targets
This commit is contained in:
parent
b66cf31219
commit
bffa3c376c
1 changed files with 137 additions and 90 deletions
|
@ -8,7 +8,7 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
#=======================================================================
|
||||
# Task: combining all dfs to a single one
|
||||
|
||||
# Input: 8 dfs
|
||||
# Input: 12/13/14 dfs
|
||||
#1) <gene>.lower()'_complex_mcsm_norm.csv'
|
||||
#2) <gene>.lower()_foldx.csv'
|
||||
#3) <gene>.lower()_dssp.csv'
|
||||
|
@ -16,20 +16,16 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
#5) <gene>.lower()_rd.csv'
|
||||
#6) 'ns' + <gene>.lower()_snp_info.csv'
|
||||
#7) <gene>.lower()_af_or.csv'
|
||||
#8) <gene>.lower() _af_or_kinship.csv
|
||||
#8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
|
||||
#9) <gene>.lower()'_dynamut2.csv'
|
||||
#10) <gene>.lower()'_dynamut.csv'
|
||||
#11) <gene>.lower()'_mcsm_na.csv'
|
||||
#12) <gene>.lower()'_mcsm_ppi2.csv'
|
||||
#13) <gene>.lower()'_consurf.csv'
|
||||
#14) <gene>.lower()'_snap2.csv'
|
||||
|
||||
# combining order
|
||||
#Merge1 = 1 + 2
|
||||
|
||||
#Merge2 = 3 + 4
|
||||
#Merge3 = Merge2 + 5
|
||||
|
||||
#Merge4 = Merge1 + Merge3
|
||||
|
||||
#Merge5 = 6 + 7
|
||||
#Merge6 = Merge5 + 8
|
||||
|
||||
#Merge7 = Merge4 + Merge6
|
||||
|
||||
# Output: single csv of all 8 dfs combined
|
||||
# useful link
|
||||
|
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')
|
|||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
|
||||
# FIXME: local imports
|
||||
#from combining import combine_dfs_with_checks
|
||||
from combining_FIXME import detect_common_cols
|
||||
from reference_dict import oneletter_aa_dict
|
||||
|
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
|
|||
if gene.lower() == "gid":
|
||||
print("\nReading mCSM file for gene:", gene)
|
||||
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
|
||||
|
||||
if gene.lower() == "embb":
|
||||
print("\nReading mCSM file for gene:", gene)
|
||||
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
||||
|
@ -183,15 +180,12 @@ infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
|||
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# ONLY:for gene pnca and gid: End logic should pick this up!
|
||||
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
|
||||
geneL_na = ['gid', 'rpob']
|
||||
if gene.lower() in geneL_na:
|
||||
print("\nGene:", gene.lower()
|
||||
, "\nReading mCSM_na files")
|
||||
# infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
||||
# infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
||||
# dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
||||
|
||||
|
||||
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
||||
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||
|
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
|
|||
geneL_dy = ['gid']
|
||||
if gene.lower() in geneL_dy:
|
||||
print("\nGene:", gene.lower()
|
||||
, "\nReading Dynamut and mCSM_na files")
|
||||
, "\nReading Dynamut files")
|
||||
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
||||
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
||||
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
||||
|
||||
# infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
||||
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||
|
||||
# ONLY:for gene embb and alr and katg: End logic should pick this up!
|
||||
geneL_ppi2 = ['embb', 'alr']
|
||||
#if gene.lower() == "embb" or "alr":
|
||||
# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
|
||||
geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
|
||||
if gene.lower() in geneL_ppi2:
|
||||
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
||||
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
|
||||
|
@ -224,10 +213,17 @@ else:
|
|||
#=======
|
||||
# output
|
||||
#=======
|
||||
# outfile 3
|
||||
out_filename_comb = gene.lower() + '_all_params.csv'
|
||||
outfile_comb = outdir + out_filename_comb
|
||||
print('\nOutput filename:', outfile_comb
|
||||
, '\n===================================================================')
|
||||
|
||||
# outfile 2
|
||||
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
|
||||
outfile_comb_afor = outdir + out_filename_comb_afor
|
||||
|
||||
# outfile 1
|
||||
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
|
||||
outfile_stab_struc = outdir + out_filename_stab_struc
|
||||
|
||||
# end of variable assignment for input and output files
|
||||
#%%############################################################################
|
||||
|
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
|
|||
# some preprocessing
|
||||
#=====================
|
||||
|
||||
#===========
|
||||
# KD
|
||||
#===========
|
||||
kd_df.shape
|
||||
|
||||
# geneL_kd = ['alr']
|
||||
# if gene.lower() in geneL_kd:
|
||||
# print('\nRunning gene:', gene.lower()
|
||||
# ,'\nChecking start numbering')
|
||||
|
||||
if kd_df['wild_type_kd'].str.contains('X').any():
|
||||
print('\nDetected X in wild_type_kd'
|
||||
, '\nRunning gene:', gene.lower()
|
||||
, '\nChecking start numbering')
|
||||
kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
|
||||
|
||||
#===========
|
||||
# FoldX
|
||||
#===========
|
||||
|
@ -305,7 +317,6 @@ else:
|
|||
|
||||
#=======================
|
||||
# Deepddg
|
||||
# TODO: RERUN 'gid'
|
||||
#=======================
|
||||
deepddg_df.shape
|
||||
|
||||
|
@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
|
|||
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
|
||||
|
||||
#--------------------------
|
||||
# Drop chain id col as other targets don't have it.Check for duplicates
|
||||
# Drop chain_id col as other
|
||||
# targets don't have it.
|
||||
#--------------------------
|
||||
col_to_drop = ['chain_id']
|
||||
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
|
||||
|
@ -374,14 +386,40 @@ else:
|
|||
, '\nGot:', deepddg_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||
'\nproceeding with merge')
|
||||
|
||||
#--------------------------
|
||||
# Deepddg outcome category
|
||||
#--------------------------
|
||||
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
||||
deepddg_df[deepddg_df['deepddg']>=0].count()
|
||||
doc = deepddg_df['deepddg_outcome'].value_counts()
|
||||
if 'deepddg_outcome' not in deepddg_df.columns:
|
||||
print('\nCreating column: deepddg_outcome')
|
||||
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
||||
deepddg_df[deepddg_df['deepddg']>=0].count()
|
||||
doc = deepddg_df['deepddg_outcome'].value_counts()
|
||||
print(doc)
|
||||
else:
|
||||
print('\nColumn exists: deepddg_outcome')
|
||||
t1 = deepddg_df['deepddg_outcome'].value_counts()
|
||||
deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
||||
t2 = deepddg_df['deepddg_outcome2'].value_counts()
|
||||
print('\n', t1, '\n', t2)
|
||||
#--------------------------
|
||||
# Drop deepddg_outcome2 col
|
||||
#--------------------------
|
||||
col_to_drop2 = ['deepddg_outcome2']
|
||||
deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)
|
||||
|
||||
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
|
||||
if all(t1 == t2):
|
||||
print('\nPASS: Deepddg_outcome category checked!')
|
||||
doc = deepddg_df['deepddg_outcome'].value_counts()
|
||||
else:
|
||||
print('\nMISmatch in deepddg_outcome counts'
|
||||
, '\n:', t1
|
||||
, '\n:', t2)
|
||||
|
||||
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
|
||||
print('\nPASS: Deepddg outcome category created')
|
||||
else:
|
||||
print('\nFAIL: Deepddg outcome category could NOT be created'
|
||||
|
@ -389,19 +427,12 @@ else:
|
|||
, '\nGot:', doc[0]
|
||||
, '\n======================================================')
|
||||
sys.exit()
|
||||
|
||||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||
'\nproceeding with merge')
|
||||
|
||||
|
||||
#=======================
|
||||
# Consurf
|
||||
#=======================
|
||||
consurf_df.shape
|
||||
|
||||
# drop row 0: as it contains no value but hangover text
|
||||
consurf_df = consurf_df.drop(index=0)
|
||||
|
||||
#----------------------
|
||||
# rename colums
|
||||
#----------------------
|
||||
|
@ -418,9 +449,9 @@ if gene.lower() in geneL_consurf:
|
|||
print('\nAdding offset value for gene:', gene.lower())
|
||||
|
||||
if gene.lower() == 'alr':
|
||||
offset_val = 34
|
||||
|
||||
offset_val = 34
|
||||
print('\nUsing offset val:', offset_val)
|
||||
|
||||
if gene.lower() == 'katg':
|
||||
offset_val = 23
|
||||
print('\nUsing offset val:', offset_val)
|
||||
|
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
|
|||
, 'MSADATA' : 'consurf_msa_data'
|
||||
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
|
||||
# quick check
|
||||
if len(consurf_df) == len(rd_df):
|
||||
if len(consurf_df) == len(kd_df):
|
||||
print('\nPASS: length of consurf df is as expected'
|
||||
, '\nProceeding to format consurf df')
|
||||
else:
|
||||
|
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
|
|||
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
|
||||
|
||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
|
||||
# non struc position are assigned a *, replacing that with a 0 so its all integer
|
||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
|
||||
|
||||
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
|
||||
|
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
|
|||
|
||||
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
|
||||
#consurf_df['wt_3upper_f']
|
||||
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
||||
|
||||
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
|
||||
|
||||
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
||||
|
||||
#-------------------------
|
||||
# scale consurf values
|
||||
#-------------------------
|
||||
|
@ -517,21 +549,35 @@ consurf_df.columns
|
|||
#---------------------------
|
||||
# select columns
|
||||
# (and also determine order)
|
||||
# this removes redundant cols:
|
||||
# consurf_colour_str
|
||||
# consurf_ci
|
||||
#---------------------------
|
||||
consurf_df_f = consurf_df[['position'
|
||||
, 'wild_type'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_score'
|
||||
, 'consurf_scaled'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_ci_upper'
|
||||
, 'consurf_ci_lower'
|
||||
, 'consurf_ci_colour'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety']]
|
||||
consurf_col_order = ['position'
|
||||
, 'wild_type'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_score'
|
||||
, 'consurf_scaled'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_ci_upper'
|
||||
, 'consurf_ci_lower'
|
||||
, 'consurf_ci_colour'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety']
|
||||
|
||||
consurf_df_f = consurf_df[consurf_col_order]
|
||||
|
||||
# CHECK: whether a general rule or a gene specific rule!
|
||||
|
||||
if consurf_df_f['chain'].isna().sum() > 0:
|
||||
print('\nNaN detected in column chain for consurf df')
|
||||
#if gene.lower() == 'embb':
|
||||
print('\nFurther consurf df processing for gene:', gene.lower())
|
||||
print('\nDropping Nan from column name chain')
|
||||
consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
|
||||
|
||||
#=======================
|
||||
# SNAP2
|
||||
#=======================
|
||||
|
@ -610,10 +656,12 @@ else:
|
|||
, '\nGot:', snap2_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
#---------------------------
|
||||
#-------------------------------------
|
||||
# select columns
|
||||
# (and also determine order)
|
||||
#---------------------------
|
||||
# renumbering already done using
|
||||
# bash and corrected file is read in
|
||||
#-------------------------------------
|
||||
snap2_df.dtypes
|
||||
snap2_df.columns
|
||||
|
||||
|
@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
|
|||
else:
|
||||
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
|
||||
|
||||
#%%
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nSecond merge: mcsm_foldx_dfs + deepddg'
|
||||
, '\n===================================')
|
||||
|
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
|
|||
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
|
||||
|
||||
#%%============================================================================
|
||||
#FIXME: select df with 'chain' to allow corret dim merging!
|
||||
# Select df with 'chain' to allow corret dim merging!
|
||||
print('==================================='
|
||||
, '\nThird merge: dssp + kd'
|
||||
, '\n===================================')
|
||||
|
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
|
|||
#, how = "outer")
|
||||
, how = "inner")
|
||||
|
||||
|
||||
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
||||
, '\n===================================================================')
|
||||
#%%============================================================================
|
||||
|
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs
|
|||
|
||||
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
|
||||
|
||||
# FIXME: check logic, doesn't effect anything else!
|
||||
# Check: whether logic effects anything else!
|
||||
if not gene == "embB":
|
||||
print("\nGene is:", gene)
|
||||
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
||||
|
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
|
|||
combined_df_clean.columns
|
||||
del(foo)
|
||||
#%%============================================================================
|
||||
# Output columns
|
||||
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
|
||||
outfile_stab_struc = outdir + out_filename_stab_struc
|
||||
print('Output filename:', outfile_stab_struc
|
||||
, '\n===================================================================')
|
||||
#---------------------
|
||||
# Output 1: write csv
|
||||
#---------------------
|
||||
print('\nWriting file: combined stability and structural parameters'
|
||||
, '\nOutput 1 filename:', outfile_stab_struc
|
||||
, '\n===================================================================\n')
|
||||
|
||||
combined_df_clean
|
||||
|
||||
# write csv
|
||||
print('\nWriting file: combined stability and structural parameters')
|
||||
combined_df_clean.to_csv(outfile_stab_struc, index = False)
|
||||
print('\nFinished writing file:'
|
||||
, '\nNo. of rows:', combined_df_clean.shape[0]
|
||||
|
@ -943,14 +987,14 @@ else:
|
|||
sys.exit('\nFAIL: merge unsuccessful for af and or')
|
||||
|
||||
#%%============================================================================
|
||||
# Output columns: when dynamut, dynamut2 and others weren't being combined
|
||||
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
|
||||
outfile_comb_afor = outdir + out_filename_comb_afor
|
||||
print('Output filename:', outfile_comb_afor
|
||||
, '\n===================================================================')
|
||||
#---------------------
|
||||
# Output 2: write csv
|
||||
# when dynamut, dynamut2 and others weren't being combined
|
||||
#---------------------
|
||||
print('\nWriting file: combined stability and afor'
|
||||
, '\nOutput 2 filename:', outfile_comb_afor
|
||||
, '\n===================================================================\n')
|
||||
|
||||
# write csv
|
||||
print('Writing file: combined stability and afor')
|
||||
combined_stab_afor.to_csv(outfile_comb_afor, index = False)
|
||||
print('\nFinished writing file:'
|
||||
, '\nNo. of rows:', combined_stab_afor.shape[0]
|
||||
|
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
|
|||
if gene.lower() == "embb":
|
||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||
if gene.lower() == "katg":
|
||||
dfs_list = [dynamut2_df]
|
||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||
if gene.lower() == "rpob":
|
||||
dfs_list = [dynamut2_df, mcsm_na_df]
|
||||
dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
|
||||
if gene.lower() == "alr":
|
||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||
|
||||
|
@ -1014,7 +1058,6 @@ else:
|
|||
, '\nExpected nrows:', expected_nrows
|
||||
, '\nGot:', len(dfs_merged_clean) )
|
||||
|
||||
# FIXME: need to extract 'cols_to_drop' programatically
|
||||
# Drop cols
|
||||
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
||||
print('\nDuplicate column names detected...'
|
||||
|
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
|||
else:
|
||||
print('\nNo duplicate column names detected, just writing file'
|
||||
, '\nTotal cols:', len(combined_all_params.columns) )
|
||||
#del(foo)
|
||||
#%% Done for gid on 10/09/2021
|
||||
# write csv
|
||||
print('Writing file: all params')
|
||||
#%%============================================================================
|
||||
#---------------------
|
||||
# Output 3: write csv
|
||||
#---------------------
|
||||
print('\nWriting file: all params')
|
||||
print('\nOutput 3 filename:', outfile_comb
|
||||
, '\n===================================================================\n')
|
||||
|
||||
combined_all_params.to_csv(outfile_comb, index = False)
|
||||
|
||||
print('\nFinished writing file:'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue