thorough checking and updates for final running of all gene targets
This commit is contained in:
parent
b66cf31219
commit
bffa3c376c
1 changed files with 137 additions and 90 deletions
|
@ -8,7 +8,7 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
# Task: combining all dfs to a single one
|
# Task: combining all dfs to a single one
|
||||||
|
|
||||||
# Input: 8 dfs
|
# Input: 12/13/14 dfs
|
||||||
#1) <gene>.lower()'_complex_mcsm_norm.csv'
|
#1) <gene>.lower()'_complex_mcsm_norm.csv'
|
||||||
#2) <gene>.lower()_foldx.csv'
|
#2) <gene>.lower()_foldx.csv'
|
||||||
#3) <gene>.lower()_dssp.csv'
|
#3) <gene>.lower()_dssp.csv'
|
||||||
|
@ -16,20 +16,16 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
#5) <gene>.lower()_rd.csv'
|
#5) <gene>.lower()_rd.csv'
|
||||||
#6) 'ns' + <gene>.lower()_snp_info.csv'
|
#6) 'ns' + <gene>.lower()_snp_info.csv'
|
||||||
#7) <gene>.lower()_af_or.csv'
|
#7) <gene>.lower()_af_or.csv'
|
||||||
#8) <gene>.lower() _af_or_kinship.csv
|
#8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
|
||||||
|
#9) <gene>.lower()'_dynamut2.csv'
|
||||||
|
#10) <gene>.lower()'_dynamut.csv'
|
||||||
|
#11) <gene>.lower()'_mcsm_na.csv'
|
||||||
|
#12) <gene>.lower()'_mcsm_ppi2.csv'
|
||||||
|
#13) <gene>.lower()'_consurf.csv'
|
||||||
|
#14) <gene>.lower()'_snap2.csv'
|
||||||
|
|
||||||
# combining order
|
# combining order
|
||||||
#Merge1 = 1 + 2
|
|
||||||
|
|
||||||
#Merge2 = 3 + 4
|
|
||||||
#Merge3 = Merge2 + 5
|
|
||||||
|
|
||||||
#Merge4 = Merge1 + Merge3
|
|
||||||
|
|
||||||
#Merge5 = 6 + 7
|
|
||||||
#Merge6 = Merge5 + 8
|
|
||||||
|
|
||||||
#Merge7 = Merge4 + Merge6
|
|
||||||
|
|
||||||
# Output: single csv of all 8 dfs combined
|
# Output: single csv of all input dfs combined
|
||||||
# useful link
|
# useful link
|
||||||
|
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# set working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||||
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
|
||||||
# FIXME: local imports
|
|
||||||
#from combining import combine_dfs_with_checks
|
#from combining import combine_dfs_with_checks
|
||||||
from combining_FIXME import detect_common_cols
|
from combining_FIXME import detect_common_cols
|
||||||
from reference_dict import oneletter_aa_dict
|
from reference_dict import oneletter_aa_dict
|
||||||
|
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
|
||||||
if gene.lower() == "gid":
|
if gene.lower() == "gid":
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
|
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
|
||||||
|
|
||||||
if gene.lower() == "embb":
|
if gene.lower() == "embb":
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
||||||
|
@ -183,15 +180,12 @@ infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
||||||
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# ONLY:for gene pnca and gid: End logic should pick this up!
|
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
|
||||||
geneL_na = ['gid', 'rpob']
|
geneL_na = ['gid', 'rpob']
|
||||||
if gene.lower() in geneL_na:
|
if gene.lower() in geneL_na:
|
||||||
print("\nGene:", gene.lower()
|
print("\nGene:", gene.lower()
|
||||||
, "\nReading mCSM_na files")
|
, "\nReading mCSM_na files")
|
||||||
# infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
|
||||||
# infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
|
||||||
# dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
|
||||||
|
|
||||||
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
||||||
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||||
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||||
|
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
|
||||||
geneL_dy = ['gid']
|
geneL_dy = ['gid']
|
||||||
if gene.lower() in geneL_dy:
|
if gene.lower() in geneL_dy:
|
||||||
print("\nGene:", gene.lower()
|
print("\nGene:", gene.lower()
|
||||||
, "\nReading Dynamut and mCSM_na files")
|
, "\nReading Dynamut files")
|
||||||
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
||||||
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
||||||
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
||||||
|
|
||||||
# infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
|
||||||
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
|
||||||
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
|
||||||
|
|
||||||
# ONLY:for gene embb and alr and katg: End logic should pick this up!
|
# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
|
||||||
geneL_ppi2 = ['embb', 'alr']
|
geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
|
||||||
#if gene.lower() == "embb" or "alr":
|
|
||||||
if gene.lower() in geneL_ppi2:
|
if gene.lower() in geneL_ppi2:
|
||||||
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
||||||
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
|
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
|
||||||
|
@ -224,10 +213,17 @@ else:
|
||||||
#=======
|
#=======
|
||||||
# output
|
# output
|
||||||
#=======
|
#=======
|
||||||
|
# outfile 3
|
||||||
out_filename_comb = gene.lower() + '_all_params.csv'
|
out_filename_comb = gene.lower() + '_all_params.csv'
|
||||||
outfile_comb = outdir + out_filename_comb
|
outfile_comb = outdir + out_filename_comb
|
||||||
print('\nOutput filename:', outfile_comb
|
|
||||||
, '\n===================================================================')
|
# outfile 2
|
||||||
|
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
|
||||||
|
outfile_comb_afor = outdir + out_filename_comb_afor
|
||||||
|
|
||||||
|
# outfile 1
|
||||||
|
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
|
||||||
|
outfile_stab_struc = outdir + out_filename_stab_struc
|
||||||
|
|
||||||
# end of variable assignment for input and output files
|
# end of variable assignment for input and output files
|
||||||
#%%############################################################################
|
#%%############################################################################
|
||||||
|
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
|
||||||
# some preprocessing
|
# some preprocessing
|
||||||
#=====================
|
#=====================
|
||||||
|
|
||||||
|
#===========
|
||||||
|
# KD
|
||||||
|
#===========
|
||||||
|
kd_df.shape
|
||||||
|
|
||||||
|
# geneL_kd = ['alr']
|
||||||
|
# if gene.lower() in geneL_kd:
|
||||||
|
# print('\nRunning gene:', gene.lower()
|
||||||
|
# ,'\nChecking start numbering')
|
||||||
|
|
||||||
|
if kd_df['wild_type_kd'].str.contains('X').any():
|
||||||
|
print('\nDetected X in wild_type_kd'
|
||||||
|
, '\nRunning gene:', gene.lower()
|
||||||
|
, '\nChecking start numbering')
|
||||||
|
kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
|
||||||
|
|
||||||
#===========
|
#===========
|
||||||
# FoldX
|
# FoldX
|
||||||
#===========
|
#===========
|
||||||
|
@ -305,7 +317,6 @@ else:
|
||||||
|
|
||||||
#=======================
|
#=======================
|
||||||
# Deepddg
|
# Deepddg
|
||||||
# TODO: RERUN 'gid'
|
|
||||||
#=======================
|
#=======================
|
||||||
deepddg_df.shape
|
deepddg_df.shape
|
||||||
|
|
||||||
|
@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
|
||||||
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
|
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
|
||||||
|
|
||||||
#--------------------------
|
#--------------------------
|
||||||
# Drop chain id col as other targets don't have it.Check for duplicates
|
# Drop chain_id col as other
|
||||||
|
# targets don't have it.
|
||||||
#--------------------------
|
#--------------------------
|
||||||
col_to_drop = ['chain_id']
|
col_to_drop = ['chain_id']
|
||||||
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
|
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
|
||||||
|
@ -374,14 +386,40 @@ else:
|
||||||
, '\nGot:', deepddg_pos2
|
, '\nGot:', deepddg_pos2
|
||||||
, '\n======================================================')
|
, '\n======================================================')
|
||||||
|
|
||||||
|
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||||
|
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||||
|
'\nproceeding with merge')
|
||||||
|
|
||||||
#--------------------------
|
#--------------------------
|
||||||
# Deepddg outcome category
|
# Deepddg outcome category
|
||||||
#--------------------------
|
#--------------------------
|
||||||
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
if 'deepddg_outcome' not in deepddg_df.columns:
|
||||||
deepddg_df[deepddg_df['deepddg']>=0].count()
|
print('\nCreating column: deepddg_outcome')
|
||||||
doc = deepddg_df['deepddg_outcome'].value_counts()
|
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
||||||
|
deepddg_df[deepddg_df['deepddg']>=0].count()
|
||||||
|
doc = deepddg_df['deepddg_outcome'].value_counts()
|
||||||
|
print(doc)
|
||||||
|
else:
|
||||||
|
print('\nColumn exists: deepddg_outcome')
|
||||||
|
t1 = deepddg_df['deepddg_outcome'].value_counts()
|
||||||
|
deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
|
||||||
|
t2 = deepddg_df['deepddg_outcome2'].value_counts()
|
||||||
|
print('\n', t1, '\n', t2)
|
||||||
|
#--------------------------
|
||||||
|
# Drop deepddg_outcome2 col
|
||||||
|
#--------------------------
|
||||||
|
col_to_drop2 = ['deepddg_outcome2']
|
||||||
|
deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)
|
||||||
|
|
||||||
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
|
if all(t1 == t2):
|
||||||
|
print('\nPASS: Deepddg_outcome category checked!')
|
||||||
|
doc = deepddg_df['deepddg_outcome'].value_counts()
|
||||||
|
else:
|
||||||
|
print('\nMISmatch in deepddg_outcome counts'
|
||||||
|
, '\n:', t1
|
||||||
|
, '\n:', t2)
|
||||||
|
|
||||||
|
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
|
||||||
print('\nPASS: Deepddg outcome category created')
|
print('\nPASS: Deepddg outcome category created')
|
||||||
else:
|
else:
|
||||||
print('\nFAIL: Deepddg outcome category could NOT be created'
|
print('\nFAIL: Deepddg outcome category could NOT be created'
|
||||||
|
@ -389,19 +427,12 @@ else:
|
||||||
, '\nGot:', doc[0]
|
, '\nGot:', doc[0]
|
||||||
, '\n======================================================')
|
, '\n======================================================')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
|
||||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
|
||||||
'\nproceeding with merge')
|
|
||||||
|
|
||||||
#=======================
|
#=======================
|
||||||
# Consurf
|
# Consurf
|
||||||
#=======================
|
#=======================
|
||||||
consurf_df.shape
|
consurf_df.shape
|
||||||
|
|
||||||
# drop row 0: as it contains no value but hangover text
|
|
||||||
consurf_df = consurf_df.drop(index=0)
|
|
||||||
|
|
||||||
#----------------------
|
#----------------------
|
||||||
# rename columns
|
# rename columns
|
||||||
#----------------------
|
#----------------------
|
||||||
|
@ -418,9 +449,9 @@ if gene.lower() in geneL_consurf:
|
||||||
print('\nAdding offset value for gene:', gene.lower())
|
print('\nAdding offset value for gene:', gene.lower())
|
||||||
|
|
||||||
if gene.lower() == 'alr':
|
if gene.lower() == 'alr':
|
||||||
offset_val = 34
|
offset_val = 34
|
||||||
|
|
||||||
print('\nUsing offset val:', offset_val)
|
print('\nUsing offset val:', offset_val)
|
||||||
|
|
||||||
if gene.lower() == 'katg':
|
if gene.lower() == 'katg':
|
||||||
offset_val = 23
|
offset_val = 23
|
||||||
print('\nUsing offset val:', offset_val)
|
print('\nUsing offset val:', offset_val)
|
||||||
|
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
|
||||||
, 'MSADATA' : 'consurf_msa_data'
|
, 'MSADATA' : 'consurf_msa_data'
|
||||||
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
|
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
|
||||||
# quick check
|
# quick check
|
||||||
if len(consurf_df) == len(rd_df):
|
if len(consurf_df) == len(kd_df):
|
||||||
print('\nPASS: length of consurf df is as expected'
|
print('\nPASS: length of consurf df is as expected'
|
||||||
, '\nProceeding to format consurf df')
|
, '\nProceeding to format consurf df')
|
||||||
else:
|
else:
|
||||||
|
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
|
||||||
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
|
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
|
||||||
|
|
||||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
|
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
|
||||||
|
# non struc positions are assigned a *; replacing that with a 0 so it's all integer
|
||||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
|
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
|
||||||
|
|
||||||
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
|
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
|
||||||
|
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
|
||||||
|
|
||||||
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
|
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
|
||||||
#consurf_df['wt_3upper_f']
|
#consurf_df['wt_3upper_f']
|
||||||
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
|
||||||
|
|
||||||
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
|
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
|
||||||
|
|
||||||
|
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
||||||
|
|
||||||
#-------------------------
|
#-------------------------
|
||||||
# scale consurf values
|
# scale consurf values
|
||||||
#-------------------------
|
#-------------------------
|
||||||
|
@ -517,21 +549,35 @@ consurf_df.columns
|
||||||
#---------------------------
|
#---------------------------
|
||||||
# select columns
|
# select columns
|
||||||
# (and also determine order)
|
# (and also determine order)
|
||||||
|
# this removes redundant cols:
|
||||||
|
# consurf_colour_str
|
||||||
|
# consurf_ci
|
||||||
#---------------------------
|
#---------------------------
|
||||||
consurf_df_f = consurf_df[['position'
|
consurf_col_order = ['position'
|
||||||
, 'wild_type'
|
, 'wild_type'
|
||||||
, 'chain'
|
, 'chain'
|
||||||
, 'wt_3upper'
|
, 'wt_3upper'
|
||||||
, 'consurf_score'
|
, 'consurf_score'
|
||||||
, 'consurf_scaled'
|
, 'consurf_scaled'
|
||||||
, 'consurf_colour'
|
, 'consurf_colour'
|
||||||
, 'consurf_colour_rev'
|
, 'consurf_colour_rev'
|
||||||
, 'consurf_ci_upper'
|
, 'consurf_ci_upper'
|
||||||
, 'consurf_ci_lower'
|
, 'consurf_ci_lower'
|
||||||
, 'consurf_ci_colour'
|
, 'consurf_ci_colour'
|
||||||
, 'consurf_msa_data'
|
, 'consurf_msa_data'
|
||||||
, 'consurf_aa_variety']]
|
, 'consurf_aa_variety']
|
||||||
|
|
||||||
|
consurf_df_f = consurf_df[consurf_col_order]
|
||||||
|
|
||||||
|
# CHECK: whether a general rule or a gene specific rule!
|
||||||
|
|
||||||
|
if consurf_df_f['chain'].isna().sum() > 0:
|
||||||
|
print('\nNaN detected in column chain for consurf df')
|
||||||
|
#if gene.lower() == 'embb':
|
||||||
|
print('\nFurther consurf df processing for gene:', gene.lower())
|
||||||
|
print('\nDropping Nan from column name chain')
|
||||||
|
consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
|
||||||
|
|
||||||
#=======================
|
#=======================
|
||||||
# SNAP2
|
# SNAP2
|
||||||
#=======================
|
#=======================
|
||||||
|
@ -610,10 +656,12 @@ else:
|
||||||
, '\nGot:', snap2_pos2
|
, '\nGot:', snap2_pos2
|
||||||
, '\n======================================================')
|
, '\n======================================================')
|
||||||
|
|
||||||
#---------------------------
|
#-------------------------------------
|
||||||
# select columns
|
# select columns
|
||||||
# (and also determine order)
|
# (and also determine order)
|
||||||
#---------------------------
|
# renumbering already done using
|
||||||
|
# bash and corrected file is read in
|
||||||
|
#-------------------------------------
|
||||||
snap2_df.dtypes
|
snap2_df.dtypes
|
||||||
snap2_df.columns
|
snap2_df.columns
|
||||||
|
|
||||||
|
@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
|
||||||
else:
|
else:
|
||||||
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
|
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
|
||||||
|
|
||||||
#%%
|
#%%============================================================================
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nSecond merge: mcsm_foldx_dfs + deepddg'
|
, '\nSecond merge: mcsm_foldx_dfs + deepddg'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
|
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
|
||||||
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
|
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
#FIXME: select df with 'chain' to allow correct dim merging!
|
# Select df with 'chain' to allow correct dim merging!
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nThird merge: dssp + kd'
|
, '\nThird merge: dssp + kd'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
|
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
|
||||||
#, how = "outer")
|
#, how = "outer")
|
||||||
, how = "inner")
|
, how = "inner")
|
||||||
|
|
||||||
|
|
||||||
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
|
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs
|
||||||
|
|
||||||
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
|
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
|
||||||
|
|
||||||
# FIXME: check logic, doesn't affect anything else!
|
# Check: whether logic affects anything else!
|
||||||
if not gene == "embB":
|
if not gene == "embB":
|
||||||
print("\nGene is:", gene)
|
print("\nGene is:", gene)
|
||||||
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
||||||
|
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
|
||||||
combined_df_clean.columns
|
combined_df_clean.columns
|
||||||
del(foo)
|
del(foo)
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Output columns
|
#---------------------
|
||||||
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
|
# Output 1: write csv
|
||||||
outfile_stab_struc = outdir + out_filename_stab_struc
|
#---------------------
|
||||||
print('Output filename:', outfile_stab_struc
|
print('\nWriting file: combined stability and structural parameters'
|
||||||
, '\n===================================================================')
|
, '\nOutput 1 filename:', outfile_stab_struc
|
||||||
|
, '\n===================================================================\n')
|
||||||
|
|
||||||
combined_df_clean
|
|
||||||
|
|
||||||
# write csv
|
|
||||||
print('\nWriting file: combined stability and structural parameters')
|
|
||||||
combined_df_clean.to_csv(outfile_stab_struc, index = False)
|
combined_df_clean.to_csv(outfile_stab_struc, index = False)
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_df_clean.shape[0]
|
, '\nNo. of rows:', combined_df_clean.shape[0]
|
||||||
|
@ -943,14 +987,14 @@ else:
|
||||||
sys.exit('\nFAIL: merge unsuccessful for af and or')
|
sys.exit('\nFAIL: merge unsuccessful for af and or')
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Output columns: when dynamut, dynamut2 and others weren't being combined
|
#---------------------
|
||||||
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
|
# Output 2: write csv
|
||||||
outfile_comb_afor = outdir + out_filename_comb_afor
|
# when dynamut, dynamut2 and others weren't being combined
|
||||||
print('Output filename:', outfile_comb_afor
|
#---------------------
|
||||||
, '\n===================================================================')
|
print('\nWriting file: combined stability and afor'
|
||||||
|
, '\nOutput 2 filename:', outfile_comb_afor
|
||||||
|
, '\n===================================================================\n')
|
||||||
|
|
||||||
# write csv
|
|
||||||
print('Writing file: combined stability and afor')
|
|
||||||
combined_stab_afor.to_csv(outfile_comb_afor, index = False)
|
combined_stab_afor.to_csv(outfile_comb_afor, index = False)
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_stab_afor.shape[0]
|
, '\nNo. of rows:', combined_stab_afor.shape[0]
|
||||||
|
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
|
||||||
if gene.lower() == "embb":
|
if gene.lower() == "embb":
|
||||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||||
if gene.lower() == "katg":
|
if gene.lower() == "katg":
|
||||||
dfs_list = [dynamut2_df]
|
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||||
if gene.lower() == "rpob":
|
if gene.lower() == "rpob":
|
||||||
dfs_list = [dynamut2_df, mcsm_na_df]
|
dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
|
||||||
if gene.lower() == "alr":
|
if gene.lower() == "alr":
|
||||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||||
|
|
||||||
|
@ -1014,7 +1058,6 @@ else:
|
||||||
, '\nExpected nrows:', expected_nrows
|
, '\nExpected nrows:', expected_nrows
|
||||||
, '\nGot:', len(dfs_merged_clean) )
|
, '\nGot:', len(dfs_merged_clean) )
|
||||||
|
|
||||||
# FIXME: need to extract 'cols_to_drop' programatically
|
|
||||||
# Drop cols
|
# Drop cols
|
||||||
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
||||||
print('\nDuplicate column names detected...'
|
print('\nDuplicate column names detected...'
|
||||||
|
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
||||||
else:
|
else:
|
||||||
print('\nNo duplicate column names detected, just writing file'
|
print('\nNo duplicate column names detected, just writing file'
|
||||||
, '\nTotal cols:', len(combined_all_params.columns) )
|
, '\nTotal cols:', len(combined_all_params.columns) )
|
||||||
#del(foo)
|
#%%============================================================================
|
||||||
#%% Done for gid on 10/09/2021
|
#---------------------
|
||||||
# write csv
|
# Output 3: write csv
|
||||||
print('Writing file: all params')
|
#---------------------
|
||||||
|
print('\nWriting file: all params')
|
||||||
|
print('\nOutput 3 filename:', outfile_comb
|
||||||
|
, '\n===================================================================\n')
|
||||||
|
|
||||||
combined_all_params.to_csv(outfile_comb, index = False)
|
combined_all_params.to_csv(outfile_comb, index = False)
|
||||||
|
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue