Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis
This commit is contained in:
commit
5102bbea1b
21 changed files with 2132 additions and 243 deletions
|
@ -54,6 +54,8 @@ os.getcwd()
|
|||
# FIXME: local imports
|
||||
#from combining import combine_dfs_with_checks
|
||||
from combining_FIXME import detect_common_cols
|
||||
from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
|
||||
from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
@ -71,13 +73,27 @@ args = arg_parser.parse_args()
|
|||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
gene_match = gene + '_p.'
|
||||
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
print('mut pattern for gene', gene, ':', gene_match)
|
||||
|
||||
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
|
||||
print('nsSNP for gene', gene, ':', nssnp_match)
|
||||
|
||||
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
|
||||
print('wt regex:', wt_regex)
|
||||
|
||||
mut_regex = r'[0-9]+(\w{3})$'
|
||||
print('mt regex:', mut_regex)
|
||||
|
||||
pos_regex = r'([0-9]+)'
|
||||
print('position regex:', pos_regex)
|
||||
#%%=======================================================================
|
||||
#==============
|
||||
# directories
|
||||
|
@ -155,6 +171,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns)
|
|||
|
||||
print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
|
||||
, '\n===================================================================')
|
||||
mcsm_foldx_dfs[merging_cols_m1].apply(len)
|
||||
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nSecond merge: dssp + kd'
|
||||
|
@ -183,6 +201,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns)
|
|||
|
||||
print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
|
||||
, '\n===================================================================')
|
||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
|
||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
||||
#%%============================================================================
|
||||
print('======================================='
|
||||
, '\nFourth merge: First merge + Third merge'
|
||||
|
@ -203,12 +223,16 @@ else:
|
|||
|
||||
print('\nResult of Fourth merge:', combined_df.shape
|
||||
, '\n===================================================================')
|
||||
combined_df[merging_cols_m4].apply(len)
|
||||
combined_df[merging_cols_m4].apply(len) == len(combined_df)
|
||||
#%%============================================================================
|
||||
|
||||
# OR merges: TEDIOUSSSS!!!!
|
||||
|
||||
|
||||
#%%RRRR
|
||||
del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
|
||||
del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
|
||||
del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
|
||||
#%%
|
||||
print('==================================='
|
||||
, '\nFifth merge: afor_df + afor_kin_df'
|
||||
, '\n===================================')
|
||||
|
@ -220,8 +244,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',')
|
|||
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
|
||||
afor_kin_df.columns = afor_kin_df.columns.str.lower()
|
||||
|
||||
|
||||
|
||||
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
|
||||
|
||||
print('Dim of afor_df:', afor_df.shape
|
||||
|
@ -230,7 +252,7 @@ print('Dim of afor_df:', afor_df.shape
|
|||
# finding if ALL afor_kin_df muts are present in afor_df
|
||||
# i.e all kinship muts should be PRESENT in mycalcs_present
|
||||
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
|
||||
print('PASS: ALL or_kinship muts are present in my or list')
|
||||
print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
|
||||
else:
|
||||
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
|
||||
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
|
||||
|
@ -241,10 +263,10 @@ else:
|
|||
|
||||
# Now checking how many afor_df muts are NOT present in afor_kin_df
|
||||
common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
|
||||
extra_muts_myor = afor_kin_df.shape[0] - common_muts
|
||||
extra_muts_myor = afor_kin_df.shape[0] - common_muts
|
||||
|
||||
print('=========================================='
|
||||
, '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
|
||||
, '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
|
||||
, '\n==========================================')
|
||||
|
||||
print('Expected cals for merging with outer_join...')
|
||||
|
@ -252,23 +274,29 @@ print('Expected cals for merging with outer_join...')
|
|||
expected_rows = afor_df.shape[0] + extra_muts_myor
|
||||
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
|
||||
|
||||
|
||||
afor_df['mutation']
|
||||
afor_kin_df['mutation']
|
||||
|
||||
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
|
||||
|
||||
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
|
||||
print('PASS: OR dfs successfully combined! PHEWWWW!')
|
||||
print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
|
||||
, '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
|
||||
else:
|
||||
print('FAIL: could not combine OR dfs'
|
||||
, '\nCheck expected rows and cols calculation and join type')
|
||||
|
||||
print('Dim of merged ors_df:', ors_df.shape)
|
||||
|
||||
ors_df[merging_cols_m5].apply(len)
|
||||
ors_df[merging_cols_m5].apply(len) == len(ors_df)
|
||||
|
||||
#%%============================================================================
|
||||
# formatting ors_df
|
||||
|
||||
ors_df.columns
|
||||
|
||||
|
||||
# Dropping unncessary columns: already removed in ealier preprocessing
|
||||
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
||||
cols_to_drop = ['n_miss']
|
||||
print('Dropping', len(cols_to_drop), 'columns:\n'
|
||||
, cols_to_drop)
|
||||
|
@ -283,18 +311,13 @@ column_order = ['mutation'
|
|||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
#, 'chr_num_allele' #old
|
||||
, 'ref_allele'
|
||||
, 'alt_allele'
|
||||
, 'mut_info_f1'
|
||||
, 'mut_info_f2'
|
||||
, 'mut_type'
|
||||
, 'gene_id'
|
||||
#, 'gene_number' #old
|
||||
, 'gene_name'
|
||||
#, 'mut_region'
|
||||
#, 'reference_allele'
|
||||
#, 'alternate_allele'
|
||||
, 'chromosome_number'
|
||||
, 'af'
|
||||
, 'af_kin'
|
||||
|
@ -317,14 +340,9 @@ column_order = ['mutation'
|
|||
, 'se_kin'
|
||||
, 'zval_logistic'
|
||||
, 'logl_h1_kin'
|
||||
, 'l_remle_kin'
|
||||
#, 'wt_3let' # old
|
||||
#, 'mt_3let' # old
|
||||
#, 'symbol'
|
||||
#, 'n_miss'
|
||||
]
|
||||
, 'l_remle_kin']
|
||||
|
||||
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
|
||||
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
|
||||
print('PASS: Column order generated for all:', len(column_order), 'columns'
|
||||
, '\nColumn names match, safe to reorder columns'
|
||||
, '\nApplying column order to df...' )
|
||||
|
@ -337,6 +355,61 @@ else:
|
|||
|
||||
print('\nResult of Sixth merge:', ors_df_ordered.shape
|
||||
, '\n===================================================================')
|
||||
#%%
|
||||
ors_df_ordered.shape
|
||||
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
|
||||
|
||||
# populating 'nan' info
|
||||
lookup_dict = dict()
|
||||
for k, v in low_3letter_dict.items():
|
||||
lookup_dict[k] = v['one_letter_code']
|
||||
#print(lookup_dict)
|
||||
|
||||
wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
|
||||
#print(wt)
|
||||
ors_df_ordered['wild_type'] = wt.map(lookup_dict)
|
||||
|
||||
ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex)
|
||||
|
||||
mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
|
||||
ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
|
||||
|
||||
ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
|
||||
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
|
||||
|
||||
# populate mut_info_f1
|
||||
ors_df_ordered['mut_info_f1'].isna().sum()
|
||||
ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
|
||||
ors_df_ordered['mut_info_f1'].isna().sum()
|
||||
|
||||
# populate mut_info_f2
|
||||
ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
|
||||
|
||||
# populate mut_type
|
||||
ors_df_ordered['mut_type'].isna().sum()
|
||||
#mut_type_word = ors_df_ordered['mut_type'].value_counts()
|
||||
mut_type_word = 'missense' # FIXME, should be derived
|
||||
ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
|
||||
ors_df_ordered['mut_type'].isna().sum()
|
||||
|
||||
# populate gene_id
|
||||
ors_df_ordered['gene_id'].isna().sum()
|
||||
#gene_id_word = ors_df_ordered['gene_id'].value_counts()
|
||||
gene_id_word = 'Rv2043c' # FIXME, should be derived
|
||||
ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
|
||||
ors_df_ordered['gene_id'].isna().sum()
|
||||
|
||||
# populate gene_name
|
||||
ors_df_ordered['gene_name'].isna().sum()
|
||||
ors_df_ordered['gene_name'].value_counts()
|
||||
ors_df_ordered['gene_name'].fillna(gene, inplace = True)
|
||||
ors_df_ordered['gene_name'].isna().sum()
|
||||
|
||||
# check numbers
|
||||
ors_df_ordered['or_kin'].isna().sum()
|
||||
# should be 0
|
||||
ors_df_ordered['or_mychisq'].isna().sum()
|
||||
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nSixth merge: Fourth + Fifth merge'
|
||||
|
@ -344,53 +417,159 @@ print('==================================='
|
|||
, '\n===================================')
|
||||
|
||||
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
|
||||
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
|
||||
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
|
||||
|
||||
# dtype problems
|
||||
if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6:
|
||||
print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
|
||||
, '\norig length of merging_cols_m6:', len(merging_cols_m6))
|
||||
merging_cols_m6.remove('position')
|
||||
print('\nlength after removing:', len(merging_cols_m6))
|
||||
|
||||
print('Dim of df1:', combined_df.shape
|
||||
, '\nDim of df2:', ors_df_ordered.shape
|
||||
, '\nNo. of merging_cols:', len(merging_cols_m6))
|
||||
|
||||
print('Checking mutations in the two dfs:'
|
||||
, '\nmuts in df1 but NOT in df2:'
|
||||
, '\nmuts in df1 present in df2:'
|
||||
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
|
||||
, '\nmuts in df2 but NOT in df1:'
|
||||
, '\nmuts in df2 present in df1:'
|
||||
, ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())
|
||||
|
||||
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
|
||||
#----------
|
||||
# merge 6
|
||||
#----------
|
||||
combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
|
||||
combined_df_all.shape
|
||||
|
||||
# FIXME: DIM
|
||||
# only with left join!
|
||||
outdf_expected_rows = len(combined_df)
|
||||
# sanity check for merge 6
|
||||
outdf_expected_rows = len(combined_df) + extra_muts_myor
|
||||
unique_muts = len(combined_df)
|
||||
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
|
||||
|
||||
#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
|
||||
if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
|
||||
if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
|
||||
print('PASS: Df dimension match'
|
||||
, '\nDim of combined_df_all with join type:', l_join
|
||||
, '\ncombined_df_all with join type:', o_join
|
||||
, '\n', combined_df_all.shape
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: Df dimension mismatch'
|
||||
, 'Cannot generate expected dim. See details of merge performed'
|
||||
, '\ndf1 dim:', combined_df.shape
|
||||
, '\ndf2 dim:', ors_df.shape
|
||||
, '\ndf2 dim:', ors_df_ordered.shape
|
||||
, '\nGot:', combined_df_all.shape
|
||||
, '\nmuts in df1 but NOT in df2:'
|
||||
, combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
|
||||
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
|
||||
, '\nmuts in df2 but NOT in df1:'
|
||||
, ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
|
||||
sys.exit()
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# nan in mutation col
|
||||
# FIXME: should get fixmed with JP's resolved dataset!?
|
||||
combined_df_all['mutation'].isna().sum()
|
||||
baz = combined_df_all[combined_df_all['mutation'].isna()]
|
||||
|
||||
# drop extra cols
|
||||
all_cols = combined_df_all.columns
|
||||
|
||||
#pos_cols_check = combined_df_all[['position_x','position_y']]
|
||||
c = combined_df_all[['position_x','position_y']].isna().sum()
|
||||
pos_col_to_drop = c.index[c>0].to_list()
|
||||
cols_to_drop = pos_col_to_drop + ['wild_type_kd']
|
||||
|
||||
print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
|
||||
combined_df_all.drop(cols_to_drop, axis = 1, inplace = True)
|
||||
|
||||
# rename position_x to position
|
||||
pos_col_to_rename = c.index[c==0].to_list()
|
||||
combined_df_all.shape
|
||||
combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
|
||||
combined_df_all.shape
|
||||
|
||||
all_cols = combined_df_all.columns
|
||||
|
||||
#%% reorder cols to for convenience
|
||||
first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
|
||||
last_cols = [col for col in combined_df_all.columns if col not in first_cols]
|
||||
|
||||
combined_df_all = combined_df_all[first_cols+last_cols]
|
||||
|
||||
#%% IMPORTANT: check if mutation related info is all populated after this merge
|
||||
# select string colnames to ensure no NA exist there
|
||||
string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
|
||||
|
||||
if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
|
||||
print('PASS: All string cols are populated with no NAs')
|
||||
else:
|
||||
print('FAIL: NAs detected in string cols')
|
||||
print(combined_df_all[string_cols].isna().sum(axis = 0))
|
||||
sys.exit()
|
||||
|
||||
# relevant mut cols
|
||||
check_mut_cols = merging_cols_m5 + merging_cols_m6
|
||||
|
||||
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
|
||||
print(check_mut_cols)
|
||||
|
||||
c2 = combined_df_all[check_mut_cols].isna().sum()
|
||||
missing_info_cols = c2.index[c2>0].to_list()
|
||||
|
||||
if c2.sum()>0:
|
||||
#na_muts_n = combined_df_all['mutation'].isna().sum()
|
||||
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
|
||||
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
|
||||
, '\nFetching these from reference dict...')
|
||||
else:
|
||||
print('No missing \'mutation\' has been detected!')
|
||||
|
||||
lookup_dict = dict()
|
||||
for k, v in oneletter_aa_dict.items():
|
||||
lookup_dict[k] = v['three_letter_code_lower']
|
||||
print(lookup_dict)
|
||||
wt_3let = combined_df_all['wild_type'].map(lookup_dict)
|
||||
#print(wt_3let)
|
||||
pos = combined_df_all['position'].astype(str)
|
||||
#print(pos)
|
||||
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
|
||||
#print(mt_3let)
|
||||
# override the 'mutation' column
|
||||
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
|
||||
print(combined_df_all['mutation'])
|
||||
|
||||
# check again
|
||||
if combined_df_all[missing_info_cols].isna().sum().all() == 0:
|
||||
print('PASS: No mutations have missing \'mutation\' info.')
|
||||
else:
|
||||
print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
|
||||
, '\nmutations have missing info STILL...')
|
||||
sys.exit()
|
||||
|
||||
#%% check
|
||||
foo = combined_df_all.drop_duplicates('mutationinformation')
|
||||
foo2 = combined_df_all.drop_duplicates('mutation')
|
||||
if foo.equals(foo2):
|
||||
print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
|
||||
else:
|
||||
print('FAIL: Still problems in merged data')
|
||||
sys.exit()
|
||||
|
||||
#%%============================================================================
|
||||
output_cols = combined_df_all.columns
|
||||
print('Output cols:', output_cols)
|
||||
|
||||
#%% IMPORTANT result info
|
||||
if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
|
||||
print('PASS: No. of NA in or_mychisq matches expected length'
|
||||
, '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum()
|
||||
, '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
|
||||
else:
|
||||
print('FAIL: No. of NA in or_mychisq does not match expected length')
|
||||
|
||||
|
||||
if combined_df_all.shape[0] == outdf_expected_rows:
|
||||
print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
|
||||
, '\n=============================================================')
|
||||
else:
|
||||
print('combined_df_all has no duplicate muts present'
|
||||
,'\n===============================================================')
|
||||
|
||||
print('\nDim of combined_data:', combined_df_all.shape
|
||||
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
|
||||
|
||||
|
||||
#%%============================================================================
|
||||
# write csv
|
||||
|
|
|
@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
#5. chain
|
||||
#6. wild_pos
|
||||
#7. wild_chain_pos
|
||||
|
||||
|
||||
#=======================================================================
|
||||
#%% load libraries
|
||||
import os, sys
|
||||
|
@ -83,16 +81,16 @@ gene = args.gene
|
|||
gene_match = gene + '_p.'
|
||||
print('mut pattern for gene', gene, ':', gene_match)
|
||||
|
||||
nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
|
||||
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
|
||||
print('nsSNP for gene', gene, ':', nssnp_match)
|
||||
|
||||
wt_regex = gene_match.lower()+'(\w{3})'
|
||||
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
|
||||
print('wt regex:', wt_regex)
|
||||
|
||||
mut_regex = r'\d+(\w{3})$'
|
||||
mut_regex = r'[0-9]+(\w{3})$'
|
||||
print('mt regex:', mut_regex)
|
||||
|
||||
pos_regex = r'(\d+)'
|
||||
pos_regex = r'([0-9]+)'
|
||||
print('position regex:', pos_regex)
|
||||
|
||||
# building cols to extract
|
||||
|
@ -156,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
|
|||
else:
|
||||
core_cols = ['id'
|
||||
, 'sample'
|
||||
, 'patient_id'
|
||||
, 'strain'
|
||||
#, 'patient_id'
|
||||
#, 'strain'
|
||||
, 'lineage'
|
||||
, 'sublineage'
|
||||
, 'country'
|
||||
#, 'country'
|
||||
, 'country_code'
|
||||
, 'geographic_source'
|
||||
#, 'region'
|
||||
, 'location'
|
||||
, 'host_body_site'
|
||||
, 'environment_material'
|
||||
, 'host_status'
|
||||
, 'host_sex'
|
||||
, 'submitted_host_sex'
|
||||
, 'hiv_status'
|
||||
, 'HIV_status'
|
||||
, 'tissue_type'
|
||||
, 'isolation_source'
|
||||
#, 'location'
|
||||
#, 'host_body_site'
|
||||
#, 'environment_material'
|
||||
#, 'host_status'
|
||||
#, 'host_sex'
|
||||
#, 'submitted_host_sex'
|
||||
#, 'hiv_status'
|
||||
#, 'HIV_status'
|
||||
#, 'tissue_type'
|
||||
#, 'isolation_source'
|
||||
, resistance_col]
|
||||
|
||||
variable_based_cols = [drug
|
||||
, dr_muts_col
|
||||
, other_muts_col]
|
||||
#, resistance_col]
|
||||
|
||||
cols_to_extract = core_cols + variable_based_cols
|
||||
print('Extracting', len(cols_to_extract), 'columns from master data')
|
||||
|
@ -202,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
|||
|
||||
#%% Write check file
|
||||
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
||||
meta_data.to_csv(check_file)
|
||||
meta_data.to_csv(check_file, index = False)
|
||||
print('Writing subsetted gwas data'
|
||||
, '\nFile', check_file
|
||||
, '\nDim:', meta_data.shape)
|
||||
|
@ -217,9 +214,9 @@ print('Writing subsetted gwas data'
|
|||
# drug counts: complete samples for OR calcs
|
||||
meta_data[drug].value_counts()
|
||||
print('===========================================================\n'
|
||||
, 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
|
||||
, 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
|
||||
, '\n===========================================================\n'
|
||||
, 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||
, 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||
, '\n===========================================================')
|
||||
|
||||
#%%
|
||||
|
@ -1173,7 +1170,7 @@ del(out_filename_metadata_poscounts)
|
|||
|
||||
#%% Write file: gene_metadata (i.e gene_LF1)
|
||||
# where each row has UNIQUE mutations NOT unique sample ids
|
||||
out_filename_metadata = gene.lower() + '_metadata.csv'
|
||||
out_filename_metadata = gene.lower() + '_metadata_raw.csv'
|
||||
outfile_metadata = outdir + '/' + out_filename_metadata
|
||||
print('Writing file: LF formatted data'
|
||||
, '\nFile:', outfile_metadata
|
||||
|
|
211
scripts/ks_test_PS.R
Normal file
211
scripts/ks_test_PS.R
Normal file
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: KS test for PS/DUET lineage distributions
|
||||
#=======================================================================
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/")
|
||||
getwd()
|
||||
|
||||
#source("/plotting/Header_TT.R")
|
||||
#source("../barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
source("plotting/combining_dfs_plotting.R")
|
||||
# should return the following dfs, directories and variables
|
||||
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat(paste0("Directories imported:"
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir))
|
||||
|
||||
cat(paste0("Variables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3))
|
||||
|
||||
###########################
|
||||
# Data for stats
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
# using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available, hence use df with NA
|
||||
###########################
|
||||
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info); str(my_df$mutation_info)
|
||||
|
||||
# subset df with dr muts only
|
||||
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
|
||||
table(my_df_dr$mutation_info)
|
||||
|
||||
# stats for all muts and dr_muts
|
||||
# 1) for all muts
|
||||
# 2) for dr_muts
|
||||
#===========================
|
||||
table(my_df$lineage); str(my_df$lineage)
|
||||
table(my_df_dr$lineage); str(my_df_dr$lineage)
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# subset and refactor: all muts
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
# subset and refactor: dr muts
|
||||
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
|
||||
df_lin_dr$lineage = factor(df_lin_dr$lineage)
|
||||
|
||||
|
||||
#{RESULT: No of samples within lineage}
|
||||
table(df_lin$lineage)
|
||||
table(df_lin_dr$lineage)
|
||||
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
length(unique(df_lin$mutationinformation))
|
||||
length(unique(df_lin_dr$mutationinformation))
|
||||
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
#================
|
||||
# ALL mutations
|
||||
#=================
|
||||
head(df_lin$lineage)
|
||||
df_lin$lineage = as.character(df_lin$lineage)
|
||||
|
||||
lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
|
||||
lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
|
||||
lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
|
||||
lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
|
||||
|
||||
# ks test
|
||||
lin12 = ks.test(lin1,lin2)
|
||||
lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
|
||||
|
||||
lin13 = ks.test(lin1,lin3)
|
||||
lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
|
||||
|
||||
lin14 = ks.test(lin1,lin4)
|
||||
lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
|
||||
|
||||
lin23 = ks.test(lin2,lin3)
|
||||
lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
|
||||
|
||||
lin24 = ks.test(lin2,lin4)
|
||||
lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
|
||||
|
||||
lin34 = ks.test(lin3,lin4)
|
||||
lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
|
||||
|
||||
ks_results_all = rbind(lin12_df
|
||||
, lin13_df
|
||||
, lin14_df
|
||||
, lin23_df
|
||||
, lin24_df
|
||||
, lin34_df)
|
||||
|
||||
#p-value < 2.2e-16
|
||||
rm(lin12, lin12_df
|
||||
, lin13, lin13_df
|
||||
, lin14, lin14_df
|
||||
, lin23, lin23_df
|
||||
, lin24, lin24_df
|
||||
, lin34, lin34_df)
|
||||
|
||||
#================
|
||||
# DRUG mutations
|
||||
#=================
|
||||
head(df_lin_dr$lineage)
|
||||
df_lin_dr$lineage = as.character(df_lin_dr$lineage)
|
||||
|
||||
lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
|
||||
lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
|
||||
lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
|
||||
lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
|
||||
|
||||
# ks test: dr muts
|
||||
lin12_dr = ks.test(lin1_dr,lin2_dr)
|
||||
lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
|
||||
|
||||
lin13_dr = ks.test(lin1_dr,lin3_dr)
|
||||
lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
|
||||
|
||||
lin14_dr = ks.test(lin1_dr,lin4_dr)
|
||||
lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
|
||||
|
||||
lin23_dr = ks.test(lin2_dr,lin3_dr)
|
||||
lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
|
||||
|
||||
lin24_dr = ks.test(lin2_dr,lin4_dr)
|
||||
lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
|
||||
|
||||
lin34_dr = ks.test(lin3_dr,lin4_dr)
|
||||
lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
|
||||
|
||||
ks_results_dr = rbind(lin12_df_dr
|
||||
, lin13_df_dr
|
||||
, lin14_df_dr
|
||||
, lin23_df_dr
|
||||
, lin24_df_dr
|
||||
, lin34_df_dr)
|
||||
|
||||
ks_results_combined = cbind(ks_results_all, ks_results_dr)
|
||||
|
||||
my_colnames = c("Lineage_comparisons"
|
||||
, paste0("All_mutations n=", nrow(df_lin))
|
||||
, paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
|
||||
my_colnames
|
||||
|
||||
# select the output columns
|
||||
ks_results_combined_f = ks_results_combined[,c(1,2,4)]
|
||||
|
||||
colnames(ks_results_combined_f) = my_colnames
|
||||
ks_results_combined_f
|
||||
|
||||
#=============
|
||||
# write output file
|
||||
#=============
|
||||
ks_results = paste0(outdir,"/results/ks_results.csv")
|
||||
write.csv(ks_results_combined_f, ks_results, row.names = F)
|
||||
|
|
@ -104,7 +104,7 @@ or_df.columns
|
|||
|
||||
#%% snp_info file: master and gene specific ones
|
||||
# gene info
|
||||
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10
|
||||
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11
|
||||
#info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10
|
||||
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
|
||||
print('*****RESULT*****'
|
||||
|
@ -212,7 +212,7 @@ else:
|
|||
|
||||
#PENDING Jody's reply
|
||||
# !!!!!!!!
|
||||
# drop nan from dfm2_mis as these are not useful
|
||||
# drop nan from dfm2_mis as these are not useful and JP confirmed the same
|
||||
print('Dropping NAs before further processing...')
|
||||
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
|
||||
# !!!!!!!!
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#%% useful links
|
||||
#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
|
||||
#https://pypi.org/project/autoviz/
|
||||
#=======================================================================
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import argparse
|
||||
from autoviz.AutoViz_Class import AutoViz_Class
|
||||
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
#%%============================================================================
|
||||
# variables
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
#%%============================================================================
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
in_filename_plotting = 'car_design.csv'
|
||||
in_filename_plotting = gene.lower() + '_all_params.csv'
|
||||
infile_plotting = outdir + '/' + in_filename_plotting
|
||||
print('plotting file: ', infile_plotting
|
||||
, '\n============================================================')
|
||||
#=======================================================================
|
||||
plotting_df = pd.read_csv(infile_plotting, sep = ',')
|
||||
#Instantiate the AutoViz class
|
||||
AV = AutoViz_Class()
|
||||
df = AV.AutoViz(infile_plotting)
|
||||
#df2 = AV.AutoViz(plotting_df)
|
||||
plotting_df.columns[~plotting_df.columns.isin(df.columns)]
|
|
@ -1,5 +1,4 @@
|
|||
#!/usr/bin/env Rscript
|
||||
getwd()
|
||||
#!/usr/bin/env Rscript getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
|
@ -19,8 +18,7 @@ getwd()
|
|||
library(ggplot2)
|
||||
library(data.table)
|
||||
source("barplot_colour_function.R")
|
||||
#source("subcols_axis.R")
|
||||
source("subcols_axis_PS.R")
|
||||
source("subcols_axis.R")
|
||||
|
||||
# should return the following dfs, directories and variables
|
||||
# mut_pos_cols
|
||||
|
@ -161,9 +159,7 @@ min(df$duet_scaled)
|
|||
max(df$duet_scaled)
|
||||
|
||||
# sanity checks
|
||||
# very important!!!!
|
||||
tapply(df$duet_scaled, df$duet_outcome, min)
|
||||
|
||||
tapply(df$duet_scaled, df$duet_outcome, max)
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
|
@ -241,7 +237,7 @@ outPlot = g +
|
|||
, axis.ticks.x = element_blank()) +
|
||||
labs(title = ""
|
||||
#title = my_title
|
||||
, x = "position"
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
print(outPlot)
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
,x,,changes,
|
||||
1,mutationinformation,,Mutationinformation,
|
||||
2,wild_type,,,consider...wild_aa
|
||||
3,position,,Position,
|
||||
4,mutant_type,,,consider...mutant_aa
|
||||
5,chain,,,
|
||||
6,ligand_id,,,
|
||||
7,ligand_distance,,,
|
||||
8,duet_stability_change,,,
|
||||
9,duet_outcome,,DUET_outcome,
|
||||
10,ligand_affinity_change,,,
|
||||
11,ligand_outcome,,Lig_outcome,
|
||||
12,duet_scaled,,ratioDUET,
|
||||
13,affinity_scaled,,ratioPredAff,
|
||||
14,wild_pos,,WildPos,
|
||||
15,wild_chain_pos,,,
|
||||
16,ddg,,,
|
||||
17,contacts,,,
|
||||
18,electro_rr,,,
|
||||
19,electro_mm,,,
|
||||
20,electro_sm,,,
|
||||
21,electro_ss,,,
|
||||
22,disulfide_rr,,,
|
||||
23,disulfide_mm,,,
|
||||
24,disulfide_sm,,,
|
||||
25,disulfide_ss,,,
|
||||
26,hbonds_rr,,,
|
||||
27,hbonds_mm,,,
|
||||
28,hbonds_sm,,,
|
||||
29,hbonds_ss,,,
|
||||
30,partcov_rr,,,
|
||||
31,partcov_mm,,,
|
||||
32,partcov_sm,,,
|
||||
33,partcov_ss,,,
|
||||
34,vdwclashes_rr,,,
|
||||
35,vdwclashes_mm,,,
|
||||
36,vdwclashes_sm,,,
|
||||
37,vdwclashes_ss,,,
|
||||
38,volumetric_rr,,,
|
||||
39,volumetric_mm,,,
|
||||
40,volumetric_sm,,,
|
||||
41,volumetric_ss,,,
|
||||
42,wild_type_dssp,,,
|
||||
43,asa,,,
|
||||
44,rsa,,,
|
||||
45,ss,,,
|
||||
46,ss_class,,,
|
||||
47,chain_id,,,
|
||||
48,wild_type_kd,,,
|
||||
49,kd_values,,,
|
||||
50,rd_values,,,
|
||||
51,wt_3letter_caps,,,
|
||||
52,mutation,,,
|
||||
53,af,,,
|
||||
54,beta_logistic,,,
|
||||
55,or_logistic,,,
|
||||
56,pval_logistic,,,
|
||||
57,se_logistic,,,
|
||||
58,zval_logistic,,,
|
||||
59,ci_low_logistic,,,
|
||||
60,ci_hi_logistic,,,
|
||||
61,or_mychisq,,,
|
||||
62,or_fisher,,,
|
||||
63,pval_fisher,,,
|
||||
64,ci_low_fisher,,,
|
||||
65,ci_hi_fisher,,,
|
||||
66,est_chisq,,,
|
||||
67,pval_chisq,,,
|
||||
68,chromosome_number,,,
|
||||
69,ref_allele,,,
|
||||
70,alt_allele,,,
|
||||
71,mut_type,,,
|
||||
72,gene_id,,,
|
||||
73,gene_number,,,
|
||||
74,mut_region,,,
|
||||
75,mut_info,,,
|
||||
76,chr_num_allele,,,
|
||||
77,wt_3let,,,
|
||||
78,mt_3let,,,
|
||||
79,af_kin,,,
|
||||
80,or_kin,,,
|
||||
81,pwald_kin,,,
|
||||
82,beta_kin,,,
|
||||
83,se_kin,,,
|
||||
84,logl_h1_kin,,,
|
||||
85,l_remle_kin,,,
|
||||
86,n_miss,,,
|
|
|
@ -6,6 +6,7 @@
|
|||
# 2) <gene>_meta_data.csv
|
||||
|
||||
# Output:
|
||||
# 1) muts with opposite effects on stability
|
||||
# 2) large combined df including NAs for AF, OR,etc
|
||||
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
|
||||
# 3) small combined df including NAs for AF, OR, etc.
|
||||
|
@ -36,17 +37,23 @@ source("plotting_data.R")
|
|||
# dup_muts
|
||||
|
||||
cat("Directories imported:"
|
||||
, "\n===================="
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir)
|
||||
|
||||
cat("Variables imported:"
|
||||
, "\n====================="
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nLength of upos:", length(upos)
|
||||
, "\nAngstrom symbol:", angstroms_symbol)
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\ndr_muts_col:", dr_muts_col
|
||||
, "\nother_muts_col:", other_muts_col
|
||||
, "\ndrtype_col:", resistance_col)
|
||||
|
||||
|
||||
# clear excess variable
|
||||
rm(my_df, upos, dup_muts)
|
||||
|
@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
|
|||
#in_file1: output of plotting_data.R
|
||||
# my_df_u
|
||||
|
||||
|
||||
# infile 2: gene associated meta data
|
||||
#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv")
|
||||
in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv")
|
||||
|
@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
|
|||
, header = T)
|
||||
cat("Dim:", dim(gene_metadata))
|
||||
|
||||
table(gene_metadata$mutation_info)
|
||||
|
||||
|
||||
# counting NAs in AF, OR cols
|
||||
# or_mychisq
|
||||
|
@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
|
|||
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
|
||||
}
|
||||
|
||||
str(gene_metadata)
|
||||
|
||||
###################################################################
|
||||
# combining: PS
|
||||
|
@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
|
|||
|
||||
cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
|
||||
, "\nNo. of merging cols:", length(merging_cols)
|
||||
, "\nMerging columns identified:\n"))
|
||||
, "\nMerging columns identified:"))
|
||||
print(merging_cols)
|
||||
|
||||
# important checks!
|
||||
|
@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
|
|||
, by = merging_cols
|
||||
, all.y = T)
|
||||
|
||||
cat("Dim of merged_df2: ", dim(merged_df2), "\n")
|
||||
cat("Dim of merged_df2: ", dim(merged_df2))
|
||||
head(merged_df2$position)
|
||||
|
||||
# sanity check
|
||||
|
@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
|
|||
,"\nExpected no. of rows: ",nrow(gene_metadata)
|
||||
,"\nGot no. of rows: ", nrow(merged_df2))
|
||||
} else{
|
||||
cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
|
||||
cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
|
||||
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
|
||||
, "\nGot no. of rows: ", nrow(merged_df2)
|
||||
, "\nFinding discrepancy\n")
|
||||
, "\nFinding discrepancy")
|
||||
merged_muts_u = unique(merged_df2$mutationinformation)
|
||||
meta_muts_u = unique(gene_metadata$mutationinformation)
|
||||
# find the index where it differs
|
||||
|
@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
|
|||
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
||||
}
|
||||
|
||||
|
||||
#=========================
|
||||
# Merge3: merged_df2_comp
|
||||
# same as merge 1 but excluding NAs from ORs, etc.
|
||||
|
@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
|
|||
|
||||
na_count_df2 = sum(is.na(merged_df2$af))
|
||||
merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
|
||||
|
||||
# sanity check: no +-1 gymnastics
|
||||
cat("Checking nrows in merged_df2_comp")
|
||||
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
|
||||
|
@ -246,20 +257,22 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
|
|||
, "\nNo. of rows: ", nrow(merged_df2_comp)
|
||||
, "\nNo. of cols: ", ncol(merged_df2_comp))
|
||||
}else{
|
||||
cat("FAIL: No. of rows mismatch"
|
||||
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
|
||||
,"\nGot no. of rows: ", nrow(merged_df2_comp))
|
||||
cat("FAIL: No. of rows mismatch"
|
||||
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
|
||||
,"\nGot no. of rows: ", nrow(merged_df2_comp))
|
||||
}
|
||||
|
||||
#=========================
|
||||
# Merge4: merged_df3_comp
|
||||
# same as merge 2 but excluding NAs from ORs, etc or
|
||||
# remove duplicate mutation information
|
||||
#=========================
|
||||
|
||||
na_count_df3 = sum(is.na(merged_df3$af))
|
||||
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
|
||||
|
||||
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
|
||||
cat("\nChecking nrows in merged_df3_comp")
|
||||
cat("Checking nrows in merged_df3_comp")
|
||||
|
||||
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
|
||||
cat("\nPASS: No. of rows match"
|
||||
,"\nDim of merged_df3_comp: "
|
||||
|
@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
|
|||
, "\nNo. of rows: ", nrow(merged_df3_comp)
|
||||
, "\nNo. of cols: ", ncol(merged_df3_comp))
|
||||
}else{
|
||||
cat("\nFAIL: No. of rows mismatch"
|
||||
cat("FAIL: No. of rows mismatch"
|
||||
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
|
||||
, "\nGot no. of rows: ", nrow(merged_df3_comp))
|
||||
,"\nGot no. of rows: ", nrow(merged_df3_comp))
|
||||
}
|
||||
|
||||
|
||||
# alternate way of deriving merged_df3_comp
|
||||
foo = merged_df3[!is.na(merged_df3$af),]
|
||||
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
|
||||
|
@ -307,7 +319,8 @@ all.equal(foo, bar)
|
|||
# clear variables
|
||||
rm(foo, bar, gene_metadata
|
||||
, in_filename_params, infile_params, merging_cols
|
||||
, in_filename_gene_metadata, infile_gene_metadata)
|
||||
, in_filename_gene_metadata, infile_gene_metadata
|
||||
, merged_df2v2, merged_df2v3)
|
||||
#*************************
|
||||
#####################################################################
|
||||
# Combining: LIG
|
||||
|
@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
|
|||
|
||||
#==========================================================================
|
||||
# end of script
|
||||
##=========================================================================
|
||||
##==========================================================================
|
||||
|
|
0
scripts/plotting/combining_two_df_FIXME.R
Executable file → Normal file
0
scripts/plotting/combining_two_df_FIXME.R
Executable file → Normal file
263
scripts/plotting/corr_plots.R
Normal file
263
scripts/plotting/corr_plots.R
Normal file
|
@ -0,0 +1,263 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Corr plots for PS and Lig
|
||||
|
||||
# Output: 1 svg
|
||||
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
|
||||
source("Header_TT.R")
|
||||
require(cowplot)
|
||||
source("combining_dfs_plotting.R")
|
||||
# should return the following dfs, directories and variables
|
||||
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat(paste0("Directories imported:"
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir))
|
||||
|
||||
cat(paste0("Variables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3))
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
# can't combine by cowplot because not ggplots
|
||||
#corr_plot_combined = "corr_combined.svg"
|
||||
#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined)
|
||||
|
||||
# PS
|
||||
corr_ps = "corr_PS.svg"
|
||||
plot_corr_ps = paste0(plotdir,"/", corr_ps)
|
||||
|
||||
# LIG
|
||||
corr_lig = "corr_LIG.svg"
|
||||
plot_corr_lig = paste0(plotdir,"/", corr_lig)
|
||||
|
||||
####################################################################
|
||||
# end of loading libraries and functions #
|
||||
########################################################################
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
df_ps = merged_df3_comp
|
||||
df_lig = merged_df3_comp_lig
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Data for Correlation plots:PS
|
||||
#===========================
|
||||
table(df_ps$duet_outcome)
|
||||
|
||||
# adding log cols
|
||||
df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
|
||||
df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
|
||||
|
||||
df_ps$log10_or_kin = log10(df_ps$or_kin)
|
||||
df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("duet_scaled"
|
||||
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
|
||||
, "log10_or_kin"
|
||||
, "neglog_pwald_kin"
|
||||
|
||||
, "af"
|
||||
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
|
||||
corr_data_ps = df_ps[, cols_to_select]
|
||||
|
||||
dim(corr_data_ps)
|
||||
|
||||
#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic
|
||||
#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c("DUET"
|
||||
|
||||
, "Log(OR)"
|
||||
, "-Log(P)"
|
||||
|
||||
, "Log(OR adjusted)"
|
||||
, "-Log(P wald)"
|
||||
|
||||
, "AF"
|
||||
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_ps)
|
||||
colnames(corr_data_ps) <- my_corr_colnames
|
||||
colnames(corr_data_ps)
|
||||
|
||||
#-----------------
|
||||
# generate corr PS plot
|
||||
#-----------------
|
||||
start = 1
|
||||
end = which(colnames(corr_data_ps) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr_ps = corr_data_ps[start:(end-offset)]
|
||||
head(my_corr_ps)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
cat("Corr plot PS:", plot_corr_ps)
|
||||
svg(plot_corr_ps, width = 15, height = 15)
|
||||
|
||||
OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
#, alpha = .05
|
||||
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 2.1
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
|
||||
print(OutPlot1)
|
||||
dev.off()
|
||||
|
||||
#===========================
|
||||
# Data for Correlation plots: LIG
|
||||
#===========================
|
||||
table(df_lig$ligand_outcome)
|
||||
|
||||
df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
|
||||
df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
|
||||
|
||||
|
||||
df_lig$log10_or_kin = log10(df_lig$or_kin)
|
||||
df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
|
||||
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("affinity_scaled"
|
||||
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
|
||||
, "log10_or_kin"
|
||||
, "neglog_pwald_kin"
|
||||
|
||||
, "af"
|
||||
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
corr_data_lig = df_lig[, cols_to_select]
|
||||
|
||||
|
||||
dim(corr_data_lig)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c("Ligand Affinity"
|
||||
|
||||
, "Log(OR)"
|
||||
, "-Log(P)"
|
||||
|
||||
, "Log(OR adjusted)"
|
||||
, "-Log(P wald)"
|
||||
|
||||
, "AF"
|
||||
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_lig)
|
||||
colnames(corr_data_lig) <- my_corr_colnames
|
||||
colnames(corr_data_lig)
|
||||
|
||||
#-----------------
|
||||
# generate corr LIG plot
|
||||
#-----------------
|
||||
|
||||
start = 1
|
||||
end = which(colnames(corr_data_lig) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr_lig = corr_data_lig[start:(end-offset)]
|
||||
head(my_corr_lig)
|
||||
|
||||
cat("Corr LIG plot:", plot_corr_lig)
|
||||
svg(plot_corr_lig, width = 15, height = 15)
|
||||
|
||||
OutPlot2 = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
#, alpha = .05
|
||||
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 2.1
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
|
||||
print(OutPlot2)
|
||||
dev.off()
|
||||
#######################################################
|
56
scripts/plotting/dirs.R
Normal file
56
scripts/plotting/dirs.R
Normal file
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: formatting data that will be used for various plots
|
||||
|
||||
# useful links
|
||||
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
#source("Header_TT.R")
|
||||
library(ggplot2)
|
||||
library(data.table)
|
||||
library(dplyr)
|
||||
|
||||
require("getopt", quietly = TRUE) #cmd parse arguments
|
||||
#========================================================
|
||||
# command line args
|
||||
#spec = matrix(c(
|
||||
# "drug" , "d", 1, "character",
|
||||
# "gene" , "g", 1, "character"
|
||||
#), byrow = TRUE, ncol = 4)
|
||||
|
||||
#opt = getopt(spec)
|
||||
|
||||
#drug = opt$druggene = opt$gene
|
||||
|
||||
#if(is.null(drug)|is.null(gene)) {
|
||||
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||
#}
|
||||
#========================================================
|
||||
# FIXME: change to cmd
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
drug = "pyrazinamide"
|
||||
gene = "pncA"
|
||||
gene_match = paste0(gene,"_p.")
|
||||
cat(gene_match)
|
||||
|
||||
#=============
|
||||
# directories and variables
|
||||
#=============
|
||||
datadir = paste0("~/git/Data")
|
||||
indir = paste0(datadir, "/", drug, "/input")
|
||||
outdir = paste0("~/git/Data", "/", drug, "/output")
|
||||
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
|
||||
|
||||
dr_muts_col = paste0('dr_mutations_', drug)
|
||||
other_muts_col = paste0('other_mutations_', drug)
|
||||
resistance_col = "drtype"
|
||||
|
||||
#%%===============================================================
|
||||
|
||||
|
||||
|
214
scripts/plotting/lineage_basic_barplot.R
Normal file
214
scripts/plotting/lineage_basic_barplot.R
Normal file
|
@ -0,0 +1,214 @@
|
|||
#!/usr/bin/env Rscript
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
#########################################################
|
||||
# TASK: Basic lineage barplot showing numbers
|
||||
|
||||
# Output: Basic barplot with lineage samples and mut count
|
||||
|
||||
##########################################################
|
||||
# Installing and loading required packages
|
||||
##########################################################
|
||||
source("Header_TT.R")
|
||||
require(data.table)
|
||||
source("combining_dfs_plotting.R")
|
||||
# should return the following dfs, directories and variables
|
||||
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat("Directories imported:"
|
||||
, "\n===================="
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir)
|
||||
|
||||
cat("Variables imported:"
|
||||
, "\n====================="
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3
|
||||
, "\ndr_muts_col:", dr_muts_col
|
||||
, "\nother_muts_col:", other_muts_col
|
||||
, "\ndrtype_col:", resistance_col)
|
||||
|
||||
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
# output of combining_dfs_plotting.R
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
# plot 1
|
||||
basic_bp_lineage = "basic_lineage_barplot.svg"
|
||||
plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage)
|
||||
|
||||
#=======================================================================
|
||||
#================
|
||||
# Data for plots:
|
||||
# you need merged_df2, comprehensive one
|
||||
# since this has one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
#================
|
||||
# REASSIGNMENT as necessary
|
||||
my_df = merged_df2
|
||||
|
||||
# clear excess variable
|
||||
rm(merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage barplot
|
||||
# x = lineage y = No. of samples
|
||||
# col = Lineage
|
||||
# fill = lineage
|
||||
#============================
|
||||
table(my_df$lineage)
|
||||
as.data.frame(table(my_df$lineage))
|
||||
|
||||
#=============
|
||||
# Data for plots
|
||||
#=============
|
||||
# REASSIGNMENT
|
||||
df <- my_df
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, lineage_count := .N, by = .(lineage)]
|
||||
|
||||
#******************
|
||||
# generate plot: barplot of mutation by lineage
|
||||
#******************
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4"
|
||||
#, "lineage5"
|
||||
#, "lineage6"
|
||||
#, "lineage7"
|
||||
)
|
||||
|
||||
df_lin = subset(df, subset = lineage %in% sel_lineages)
|
||||
|
||||
# Create df with lineage inform & no. of unique mutations
|
||||
# per lineage and total samples within lineage
|
||||
# this is essentially barplot with two y axis
|
||||
|
||||
bar = bar = as.data.frame(sel_lineages) #4, 1
|
||||
total_snps_u = NULL
|
||||
total_samples = NULL
|
||||
|
||||
for (i in sel_lineages){
|
||||
#print(i)
|
||||
curr_total = length(unique(df$id)[df$lineage==i])
|
||||
total_samples = c(total_samples, curr_total)
|
||||
print(total_samples)
|
||||
|
||||
foo = df[df$lineage==i,]
|
||||
print(paste0(i, "======="))
|
||||
print(length(unique(foo$mutationinformation)))
|
||||
curr_count = length(unique(foo$mutationinformation))
|
||||
|
||||
total_snps_u = c(total_snps_u, curr_count)
|
||||
}
|
||||
|
||||
print(total_snps_u)
|
||||
bar$num_snps_u = total_snps_u
|
||||
bar$total_samples = total_samples
|
||||
bar
|
||||
|
||||
#*****************
|
||||
# generate plot: lineage barplot with two y-axis
|
||||
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
|
||||
#*****************
|
||||
|
||||
y1 = bar$num_snps_u
|
||||
y2 = bar$total_samples
|
||||
x = sel_lineages
|
||||
|
||||
to_plot = data.frame(x = x
|
||||
, y1 = y1
|
||||
, y2 = y2)
|
||||
to_plot
|
||||
|
||||
# FIXME later: will be depricated!
|
||||
melted = melt(to_plot, id = "x")
|
||||
melted
|
||||
|
||||
|
||||
svg(plot_basic_bp_lineage)
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(melted, aes(x = x
|
||||
, y = value
|
||||
, fill = variable))
|
||||
|
||||
printFile = g + geom_bar(stat = "identity"
|
||||
, position = position_stack(reverse = TRUE)
|
||||
, alpha=.75
|
||||
, colour='grey75') +
|
||||
theme(axis.text.x = element_text(size = my_ats)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
#, angle = 30
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als
|
||||
, colour = 'black')
|
||||
, axis.title.y = element_text(size = my_als
|
||||
, colour = 'black')
|
||||
, legend.position = "top"
|
||||
, legend.text = element_text(size = my_als)) +
|
||||
#geom_text() +
|
||||
geom_label(aes(label = value)
|
||||
, size = 5
|
||||
, hjust = 0.5
|
||||
, vjust = 0.5
|
||||
, colour = 'black'
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)) +
|
||||
labs(title = ''
|
||||
, x = ''
|
||||
, y = "Number"
|
||||
, fill = 'Variable'
|
||||
, colour = 'black') +
|
||||
scale_fill_manual(values = c('grey50', 'gray75')
|
||||
, name=''
|
||||
, labels=c('Mutations', 'Total Samples')) +
|
||||
scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
301
scripts/plotting/lineage_dist_combined_PS.R
Normal file
301
scripts/plotting/lineage_dist_combined_PS.R
Normal file
|
@ -0,0 +1,301 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Lineage dist plots: ggridges
|
||||
|
||||
# Output: 2 SVGs for PS stability
|
||||
|
||||
# 1) all muts
|
||||
# 2) dr_muts
|
||||
|
||||
##########################################################
|
||||
# Installing and loading required packages
|
||||
##########################################################
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
library(ggridges)
|
||||
source("combining_dfs_plotting.R")
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat("Directories imported:"
|
||||
, "\n===================="
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir)
|
||||
|
||||
cat("Variables imported:"
|
||||
, "\n====================="
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3
|
||||
, "\ndr_muts_col:", dr_muts_col
|
||||
, "\nother_muts_col:", other_muts_col
|
||||
, "\ndrtype_col:", resistance_col)
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
lineage_dist_combined = "lineage_dist_combined_PS.svg"
|
||||
plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined)
|
||||
#========================================================================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
# using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available, hence use df with NA
|
||||
###########################
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
|
||||
# delete variables not required
|
||||
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
# subset df with dr muts only
|
||||
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
|
||||
table(my_df_dr$mutation_info)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot 1: ALL Muts
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
my_plot_name = 'lineage_dist_PS.svg'
|
||||
|
||||
plot_lineage_duet = paste0(plotdir,"/", my_plot_name)
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
table(my_df$lineage); str(my_df$lineage)
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4"
|
||||
#, "lineage5"
|
||||
#, "lineage6"
|
||||
#, "lineage7"
|
||||
)
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
|
||||
table(df_lin$lineage)
|
||||
|
||||
# refactor
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
|
||||
|
||||
table(df_lin$lineage)#{RESULT: No of samples within lineage}
|
||||
|
||||
length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
length(df_lin$mutationinformation)
|
||||
|
||||
u2 = unique(my_df$mutationinformation)
|
||||
u = unique(df_lin$mutationinformation)
|
||||
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# 2 : ggridges (good!)
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
|
||||
#, 'Lineage 5', 'Lineage 6', 'Lineage 7'
|
||||
)
|
||||
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
|
||||
# , 'lineage5', 'lineage6', 'lineage7'
|
||||
)
|
||||
# check plot name
|
||||
plot_lineage_duet
|
||||
|
||||
# output svg
|
||||
#svg(plot_lineage_duet)
|
||||
p1 = ggplot(df, aes(x = duet_scaled
|
||||
, y = duet_outcome))+
|
||||
|
||||
#printFile=geom_density_ridges_gradient(
|
||||
geom_density_ridges_gradient(aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
#, switch = 'x'
|
||||
, labeller = labeller(lineage = my_labels) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "DUET" ) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als-5)
|
||||
, legend.title = element_text(size = my_als)
|
||||
)
|
||||
|
||||
print(p1)
|
||||
#dev.off()
|
||||
|
||||
#######################################################################
|
||||
# lineage distribution plot for dr_muts
|
||||
#######################################################################
|
||||
|
||||
#==========================
|
||||
# Plot 2: dr muts ONLY
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
|
||||
|
||||
plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr)
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
table(my_df_dr$lineage); str(my_df_dr$lineage)
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
|
||||
table(df_lin_dr$lineage)
|
||||
|
||||
# refactor
|
||||
df_lin_dr$lineage = factor(df_lin_dr$lineage)
|
||||
|
||||
sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
|
||||
|
||||
table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
|
||||
|
||||
length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
length(df_lin_dr$mutationinformation)
|
||||
|
||||
u2 = unique(my_df_dr$mutationinformation)
|
||||
u = unique(df_lin_dr$mutationinformation)
|
||||
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df_dr <- df_lin_dr
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(df_lin_dr)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# 2 : ggridges (good!)
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
|
||||
# check plot name
|
||||
plot_lineage_dr_duet
|
||||
|
||||
# output svg
|
||||
#svg(plot_lineage_dr_duet)
|
||||
p2 = ggplot(df_dr, aes(x = duet_scaled
|
||||
, y = duet_outcome))+
|
||||
|
||||
#printFile=geom_density_ridges_gradient(
|
||||
geom_density_ridges_gradient(aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
#, switch = 'x'
|
||||
, labeller = labeller(lineage = my_labels) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)
|
||||
#, ylim = c(0, 6)
|
||||
#, clip = "off"
|
||||
) +
|
||||
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "DUET" ) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size = my_als)
|
||||
, legend.text = element_text(size = 10)
|
||||
, legend.title = element_text(size = my_als)
|
||||
#, legend.position = "none"
|
||||
)
|
||||
|
||||
print(p2)
|
||||
#dev.off()
|
||||
########################################################################
|
||||
#==============
|
||||
# combine plot
|
||||
#===============
|
||||
|
||||
svg(plot_lineage_dist_combined, width = 12, height = 6)
|
||||
|
||||
printFile = cowplot::plot_grid(p1, p2
|
||||
, label_size = my_als+10)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
96
scripts/plotting/opp_mcsm_muts.R
Normal file
96
scripts/plotting/opp_mcsm_muts.R
Normal file
|
@ -0,0 +1,96 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: To write muts with opposite effects on
|
||||
# protomer and ligand stability
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("plotting_data.R")
|
||||
|
||||
# should return the following dfs, directories and variables
|
||||
# my_df
|
||||
# my_df_u
|
||||
# my_df_u_lig
|
||||
# dup_muts
|
||||
|
||||
cat(paste0("Directories imported:"
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir))
|
||||
|
||||
cat(paste0("Variables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nLength of upos:", length(upos)
|
||||
, "\nAngstrom symbol:", angstroms_symbol))
|
||||
|
||||
# clear excess variable
|
||||
rm(my_df, upos, dup_muts)
|
||||
#========================================================
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
#in_file1: output of plotting_data.R
|
||||
# my_df_u
|
||||
|
||||
#===========
|
||||
# output
|
||||
#===========
|
||||
# mutations with opposite effects
|
||||
out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
|
||||
outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
|
||||
|
||||
#%%===============================================================
|
||||
|
||||
# spelling Correction 1: DUET incase American spelling needed!
|
||||
table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
|
||||
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
|
||||
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
|
||||
|
||||
|
||||
# spelling Correction 2: Ligand incase American spelling needed!
|
||||
table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
|
||||
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
|
||||
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
|
||||
|
||||
|
||||
# muts with opposing effects on protomer and ligand stability
|
||||
table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
|
||||
changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
|
||||
|
||||
# sanity check: redundant, but uber cautious!
|
||||
dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
|
||||
ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
|
||||
|
||||
cat("Identifying muts with opposite stability effects")
|
||||
if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
|
||||
cat("PASS: muts with opposite effects on stability and affinity correctly identified"
|
||||
, "\nNo. of such muts: ", nrow(changes))
|
||||
}else {
|
||||
cat("FAIL: unsuccessful in extracting muts with changed stability effects")
|
||||
}
|
||||
|
||||
#==========================
|
||||
# write file: changed muts
|
||||
#==========================
|
||||
write.csv(changes, outfile_opp_muts)
|
||||
|
||||
cat("Finished writing file for muts with opp effects:"
|
||||
, "\nFilename: ", outfile_opp_muts
|
||||
, "\nDim:", dim(changes))
|
||||
|
||||
# clear variables
|
||||
rm(out_filename_opp_muts, outfile_opp_muts)
|
||||
rm(changes, dl_i, ld_i)
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count
|
||||
df_ncols = ncol(my_df_u)
|
||||
|
||||
#===================================== end of script
|
192
scripts/plotting/or_plots_combined.R
Normal file
192
scripts/plotting/or_plots_combined.R
Normal file
|
@ -0,0 +1,192 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Bubble plot of OR for PS and Lig
|
||||
|
||||
# Output: 1 svg
|
||||
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
|
||||
source("Header_TT.R")
|
||||
require(cowplot)
|
||||
source("combining_dfs_plotting.R")
|
||||
# should return the following dfs, directories and variables
|
||||
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat(paste0("Directories imported:"
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir))
|
||||
|
||||
cat(paste0("Variables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3))
|
||||
|
||||
#=========================
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
or_combined = "or_combined_PS_LIG.svg"
|
||||
plot_or_combined = paste0(plotdir,"/", or_combined)
|
||||
|
||||
#or_kin_combined = "or_kin_combined_PS_LIG.svg"
|
||||
#plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined)
|
||||
#=======================================================================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
ps_df = merged_df3_comp
|
||||
lig_df = merged_df3_comp_lig
|
||||
|
||||
# Ensure correct data type in columns to plot: should be TRUE
|
||||
is.numeric(ps_df$or_mychisq)
|
||||
is.numeric(lig_df$or_mychisq)
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
|
||||
#%% end of section 1
|
||||
|
||||
# sanity check: should be <10
|
||||
if (max(lig_df$ligand_distance) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
#############
|
||||
# Plots: Bubble plot
|
||||
# x = Position, Y = stability
|
||||
# size of dots = OR
|
||||
# col: stability
|
||||
#############
|
||||
|
||||
#-----------------
|
||||
# Plot 1: DUET vs OR by position as geom_points
|
||||
#-------------------
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome))
|
||||
|
||||
g1 = ggplot(ps_df, aes(x = factor(position)
|
||||
, y = duet_scaled))
|
||||
|
||||
p1 = g1 +
|
||||
geom_point(aes(col = duet_outcome
|
||||
, size = or_mychisq))+
|
||||
#, size = or_kin)) + # not good, almost like log(or)
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "DUET(PS)"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "DUET Outcome") +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p1
|
||||
|
||||
#-------------------
|
||||
# generate plot 2: Lig vs OR by position as geom_points
|
||||
#-------------------
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(lig_df$ligand_outcome)
|
||||
|
||||
g2 = ggplot(lig_df, aes(x = factor(position)
|
||||
, y = affinity_scaled))
|
||||
|
||||
p2 = g2 +
|
||||
geom_point(aes(col = ligand_outcome
|
||||
, size = or_mychisq))+
|
||||
#, size = or_kin)) + # not good! almost like log(or)
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "Ligand Affinity"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "Ligand Outcome"
|
||||
) +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p2
|
||||
|
||||
#======================
|
||||
# combine using cowplot
|
||||
#======================
|
||||
|
||||
svg(plot_or_combined, width = 32, height = 12)
|
||||
#svg(plot_or_kin_combined, width = 32, height = 12)
|
||||
|
||||
theme_set(theme_gray()) # to preserve default theme
|
||||
|
||||
printFile = cowplot::plot_grid(plot_grid(p1, p2
|
||||
, ncol = 1
|
||||
, align = 'v'
|
||||
, labels = c("", "")
|
||||
, label_size = my_als+5))
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -15,7 +15,10 @@ library(ggplot2)
|
|||
library(data.table)
|
||||
library(dplyr)
|
||||
|
||||
source("dirs.R")
|
||||
|
||||
require("getopt", quietly = TRUE) #cmd parse arguments
|
||||
|
||||
#========================================================
|
||||
# command line args
|
||||
#spec = matrix(c(
|
||||
|
@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments
|
|||
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||
#}
|
||||
#========================================================
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
drug = "pyrazinamide"
|
||||
gene = "pncA"
|
||||
gene_match = paste0(gene,"_p.")
|
||||
cat(gene_match)
|
||||
|
||||
#=============
|
||||
# directories
|
||||
#=============
|
||||
datadir = paste0("~/git/Data")
|
||||
indir = paste0(datadir, "/", drug, "/input")
|
||||
outdir = paste0("~/git/Data", "/", drug, "/output")
|
||||
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
|
||||
#======
|
||||
# input
|
||||
#======
|
||||
|
@ -52,6 +42,17 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
|||
infile_params = paste0(outdir, "/", in_filename_params)
|
||||
cat(paste0("Input file 1:", infile_params) )
|
||||
|
||||
|
||||
cat('columns based on variables:\n'
|
||||
, drug
|
||||
, '\n'
|
||||
, dr_muts_col
|
||||
, '\n'
|
||||
, other_muts_col
|
||||
, "\n"
|
||||
, resistance_col
|
||||
, '\n===============================================================')
|
||||
|
||||
#%%===============================================================
|
||||
###########################
|
||||
# Read file: struct params
|
||||
|
@ -62,9 +63,6 @@ my_df = read.csv(infile_params, header = T)
|
|||
|
||||
cat("\nInput dimensions:", dim(my_df))
|
||||
|
||||
# quick checks
|
||||
#colnames(my_df)
|
||||
#str(my_df)
|
||||
|
||||
###########################
|
||||
# extract unique mutation entries
|
||||
|
|
251
scripts/plotting/ps_plots_combined.R
Normal file
251
scripts/plotting/ps_plots_combined.R
Normal file
|
@ -0,0 +1,251 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: AF, OR and stability plots: PS
|
||||
# Output: 1 SVG
|
||||
|
||||
#########################################################
|
||||
# Installing and loading required packages
|
||||
##########################################################
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
library(ggplot2)
|
||||
library(data.table)
|
||||
#source("combining_dfs_plotting.R")
|
||||
source("barplot_colour_function.R")
|
||||
source("subcols_axis.R")
|
||||
|
||||
###########################
|
||||
# Data for PS plots
|
||||
# you need merged_df3_comp/merged_df_comp
|
||||
# since these have unique SNPs
|
||||
###########################
|
||||
|
||||
no_or_index = which(is.na(my_df_u_cols$or_mychisq))
|
||||
|
||||
my_df_u_cols_clean = my_df_u_cols[-no_or_index,]
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df = my_df_u_cols_clean
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
cols_to_select = colnames(mut_pos_cols)
|
||||
|
||||
mut_pos_cols_clean = df[colnames(df)%in%cols_to_select]
|
||||
mut_pos_cols_clean = unique(mut_pos_cols_clean[, 1:length(mut_pos_cols_clean)])
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Barplot with axis colours:
|
||||
#===========================
|
||||
#================
|
||||
# Inspecting mut_pos_cols
|
||||
# position numbers and colours and assigning axis colours based on lab_fg
|
||||
# of the correct df
|
||||
# open file from desktop ("sample_axis_cols") for cross checking
|
||||
#================
|
||||
# very important!
|
||||
#my_axis_colours = mut_pos_cols$lab_fg
|
||||
|
||||
if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){
|
||||
print("PASS: lengths checked, assigning axis colours")
|
||||
my_axis_colours = mut_pos_cols_clean$lab_fg
|
||||
cat("length of axis colours:", length(my_axis_colours)
|
||||
, "\nwhich corresponds to the number of positions on the x-axis of the plot")
|
||||
}else{
|
||||
print("FAIL:lengths mismatch, could not assign axis colours")
|
||||
quit()
|
||||
}
|
||||
|
||||
# unique positions
|
||||
upos = unique(df$position); length(upos)
|
||||
|
||||
table(df$duet_outcome)
|
||||
|
||||
# add frequency of positions (from lib data.table)
|
||||
setDT(df)[, pos_count := .N, by = .(position)]
|
||||
|
||||
# this is cummulative
|
||||
table(df$pos_count)
|
||||
|
||||
# use group by on this
|
||||
library(dplyr)
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
|
||||
snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
|
||||
|
||||
# sanity checks
|
||||
# should be a factor
|
||||
if (is.factor(df$duet_outcome)){
|
||||
print("duet_outcome is factor")
|
||||
}else{
|
||||
print("converting duet_outcome to factor")
|
||||
df$duet_outcome = as.factor(df$duet_outcome)
|
||||
}
|
||||
|
||||
is.factor(df$duet_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$duet_scaled)
|
||||
max(df$duet_scaled)
|
||||
|
||||
# sanity checks
|
||||
tapply(df$duet_scaled, df$duet_outcome, min)
|
||||
tapply(df$duet_scaled, df$duet_outcome, max)
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = duet_outcome
|
||||
# subgroup = normalised score i.e duet_scaled
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$duet_scaled)
|
||||
cat("No. of unique values in normalised data:", length(u))
|
||||
|
||||
# Define group
|
||||
# Create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$duet_scaled # no rounding
|
||||
df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
|
||||
print(paste0("Colour palette generated for: ", length(colours), " colours"))
|
||||
my_title = "Protein stability (DUET)"
|
||||
cat("No. of axis colours: ", length(my_axis_colours))
|
||||
|
||||
#******************
|
||||
# generate plot: barplot unordered by frequency with axis colours
|
||||
#******************
|
||||
class(df$lab_bg)
|
||||
|
||||
# define cartesian coord
|
||||
my_xlim = length(unique(df$position)); my_xlim
|
||||
|
||||
# axis label size
|
||||
my_xals = 18
|
||||
my_yals = 18
|
||||
|
||||
# axes text size
|
||||
my_xats = 14
|
||||
my_yats = 18
|
||||
|
||||
g3 = ggplot(df, aes(factor(position, ordered = T)))
|
||||
|
||||
p3 = g3 +
|
||||
coord_cartesian(xlim = c(1, my_xlim)
|
||||
#, ylim = c(0, 6)
|
||||
, ylim = c(0, max(snp_count))
|
||||
, clip = "off") +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual(values = colours
|
||||
, guide = "none") +
|
||||
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
|
||||
, fill = df$lab_bg) +
|
||||
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
|
||||
, fill = df$lab_bg2) +
|
||||
|
||||
# Here it"s important to specify that your axis goes from 1 to max number of levels
|
||||
theme(axis.text.x = element_text(size = my_xats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4
|
||||
, colour = my_axis_colours)
|
||||
, axis.text.y = element_text(size = my_yats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xals)
|
||||
, axis.title.y = element_text(size = my_yals+2 )
|
||||
, axis.ticks.x = element_blank()) +
|
||||
labs(title = ""
|
||||
#title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
p3
|
||||
#=================
|
||||
# generate plot: AF by position unordered, not coloured
|
||||
#=================
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g1 = ggplot(df)
|
||||
p1 = g1 +
|
||||
geom_bar(aes(x = factor(position, ordered = T)
|
||||
, y = af*100
|
||||
#, fill = duet_outcome
|
||||
)
|
||||
, color = "black"
|
||||
, fill = "lightgrey"
|
||||
, stat = "identity") +
|
||||
theme(axis.text.x = element_blank()
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size = my_als) ) +
|
||||
labs(title = ""
|
||||
#, size = 100
|
||||
#, x = "Wild-type position"
|
||||
, y = "AF(%)")
|
||||
|
||||
p1
|
||||
#=================
|
||||
# generate plot: OR by position unordered
|
||||
#=================
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
|
||||
g2 = ggplot(df)
|
||||
|
||||
p2 = g2 +
|
||||
geom_bar(aes(x = factor(position, ordered = T)
|
||||
, y = or_mychisq
|
||||
#, fill = duet_outcome
|
||||
)
|
||||
, colour = "black"
|
||||
, fill = "lightgray"
|
||||
, stat = "identity") +
|
||||
theme(axis.text.x = element_blank()
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size = my_als) ) +
|
||||
labs(#title = "OR by position"
|
||||
#, x = "Wild-type position"
|
||||
y = "OR")
|
||||
|
||||
p2
|
||||
|
||||
########################################################################
|
||||
# end of DUET barplots #
|
||||
########################################################################
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
ps_plots_combined = "af_or_combined_PS.svg"
|
||||
plot_ps_plots_combined = paste0(plotdir,"/", ps_plots_combined )
|
||||
|
||||
svg(plot_ps_plots_combined , width = 26, height = 12)
|
||||
|
||||
#theme_set(theme_gray()) # to preserve default theme
|
||||
printFile = cowplot::plot_grid(p1, p2, p3
|
||||
, ncol = 1
|
||||
, align = 'v')
|
||||
printFile
|
||||
dev.off()
|
136
scripts/plotting/resolving_ambiguous_muts.R
Normal file
136
scripts/plotting/resolving_ambiguous_muts.R
Normal file
|
@ -0,0 +1,136 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
|
||||
#(which is one of the outputs from python script)
|
||||
|
||||
# Input csv file: <gene>_metadata.csv
|
||||
|
||||
# Output csv file: <gene>_metadata_clean.csv
|
||||
|
||||
#########################################################
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("dirs.R")
|
||||
|
||||
cat("Directories imported:"
|
||||
, "\n===================="
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir)
|
||||
|
||||
cat("Variables imported:"
|
||||
, "\n====================="
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\ndr_muts_col:", dr_muts_col
|
||||
, "\nother_muts_col:", other_muts_col
|
||||
, "\ndrtype_col:", resistance_col)
|
||||
|
||||
#========================================================
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
# infile1: gene associated meta data
|
||||
gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv")
|
||||
infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
|
||||
cat("Input infile 1:", infile_gene_metadata_raw)
|
||||
|
||||
# infile 2: ambiguous muts
|
||||
ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv")
|
||||
infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
|
||||
cat("Input infile 2:", infile_ambiguous_muts)
|
||||
|
||||
#===========
|
||||
# output
|
||||
#===========
|
||||
# clean gene_metadat with ambiguous muts resolved
|
||||
gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
|
||||
outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean)
|
||||
cat("Output file:", outfile_gene_metadata_clean)
|
||||
|
||||
#%%===============================================================
|
||||
|
||||
###########################
|
||||
# Read file: <gene>_meta data raw.csv
|
||||
###########################
|
||||
cat("Reading meta data file:", infile_gene_metadata_raw)
|
||||
|
||||
gene_metadata_raw <- read.csv(infile_gene_metadata_raw
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
cat("Dim:", dim(gene_metadata_raw))
|
||||
|
||||
###########################
|
||||
# Read file: ambiguous muts.csv
|
||||
##########################
|
||||
ambiguous_muts = read.csv(infile_ambiguous_muts)
|
||||
|
||||
ambiguous_muts_names = ambiguous_muts$mutation
|
||||
|
||||
common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
|
||||
|
||||
#==============
|
||||
# resolving ambiguous muts
|
||||
#===============
|
||||
table(gene_metadata_raw$mutation_info)
|
||||
count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
|
||||
count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0
|
||||
, "ambi", "pass")
|
||||
table(count_check$checks)
|
||||
|
||||
cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
|
||||
|
||||
# remove all ambiguous muts from df
|
||||
ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
|
||||
gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
|
||||
|
||||
# sanity checks: should be true
|
||||
table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
|
||||
nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
|
||||
|
||||
# check before resolving ambiguous muts
|
||||
table(common_muts_all$mutation_info)
|
||||
common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
|
||||
|
||||
# resolving ambiguous muts: change other_muts to dr_muts
|
||||
common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
|
||||
|
||||
table(common_muts_all$mutation_info)
|
||||
common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
|
||||
table(common_muts_all$mutation_info)
|
||||
common_muts_all$mutation_info = as.character(common_muts_all$mutation_info)
|
||||
|
||||
# combining to a clean df
|
||||
gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
|
||||
all(dim(gene_metadata) == dim(gene_metadata))
|
||||
table(gene_metadata$mutation_info)
|
||||
|
||||
count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
|
||||
, gene_metadata$mutation_info)))
|
||||
|
||||
count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0
|
||||
, "ambi", "pass")
|
||||
|
||||
if (table(count_check2$checks) == nrow(count_check2)){
|
||||
cat("PASS: ambiguous muts successfully resolved."
|
||||
, "\nwriting file")
|
||||
}else{
|
||||
print("FAIL: ambiguous muts could not be resolved!")
|
||||
quit()
|
||||
}
|
||||
#============
|
||||
# writing file
|
||||
#=============
|
||||
write.csv(gene_metadata, outfile_gene_metadata_clean
|
||||
, row.names = FALSE)
|
||||
|
||||
#=====================================================================
|
||||
# End of script
|
||||
#======================================================================
|
|
@ -1,12 +1,26 @@
|
|||
#========
|
||||
# plotting
|
||||
#========
|
||||
FIXME:
|
||||
|
||||
#=======================
|
||||
dirs.R
|
||||
#=======================
|
||||
#dirs.R: imports dir structure
|
||||
change this to cmd so you can pass in drug and gene name
|
||||
|
||||
#=======================
|
||||
plotting_data.R
|
||||
#=======================
|
||||
??? update how to run
|
||||
|
||||
|
||||
#=======================
|
||||
combining_dfs_plotting.R
|
||||
#=======================
|
||||
??? update how to run
|
||||
|
||||
|
||||
#=======================
|
||||
mcsm_mean_stability.R
|
||||
#=======================
|
||||
|
@ -58,6 +72,13 @@ barplots_subcolours_PS.R
|
|||
# output plots: 1 svg
|
||||
1) barplot_coloured_PS.svg
|
||||
|
||||
#=======================
|
||||
lineage_dist_PS.R
|
||||
# lineage distribution for all muts and dr muts in one figure
|
||||
#=======================
|
||||
# input: calls the "combining_dfs_plotting.R"
|
||||
# output plots: 1 svg
|
||||
1) lineage_dist_combined_PS.svg
|
||||
|
||||
|
||||
########################################################################
|
||||
|
|
96
scripts/plotting/test.R
Normal file
96
scripts/plotting/test.R
Normal file
|
@ -0,0 +1,96 @@
|
|||
setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting")
|
||||
|
||||
source("combining_dfs_plotting.R")
|
||||
|
||||
table(merged_df3$mutation_info)
|
||||
|
||||
# assign foldx
|
||||
#ddg<0 = "Stabilising" (-ve)
|
||||
table(merged_df3$ddg < 0)
|
||||
merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising")
|
||||
|
||||
#===========
|
||||
# PS data
|
||||
#===========
|
||||
dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",]
|
||||
other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",]
|
||||
|
||||
par(mfrow = c(1,1))
|
||||
par(mfrow = c(2,6))
|
||||
|
||||
# mcsm duet
|
||||
boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET"
|
||||
#, col = factor(merged_df3$duet_outcome)
|
||||
)
|
||||
wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F)
|
||||
|
||||
# foldx ddg
|
||||
boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx")
|
||||
wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F)
|
||||
|
||||
# rd
|
||||
boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD")
|
||||
wilcox.test(dr_muts$rd_values, other_muts$rd_values)
|
||||
|
||||
# kd
|
||||
boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD")
|
||||
wilcox.test(dr_muts$kd_values, other_muts$kd_values)
|
||||
|
||||
# asa
|
||||
boxplot(dr_muts$asa, other_muts$asa, main = "ASA")
|
||||
wilcox.test(dr_muts$asa, other_muts$asa)
|
||||
|
||||
# rsa
|
||||
boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA")
|
||||
wilcox.test(dr_muts$rsa, other_muts$rsa)
|
||||
|
||||
#===================================================================
|
||||
#==========
|
||||
# LIG data
|
||||
#==========
|
||||
dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",]
|
||||
other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",]
|
||||
|
||||
# mcsm ligand affinity
|
||||
boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity")
|
||||
wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F)
|
||||
|
||||
# rd
|
||||
boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD")
|
||||
wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values)
|
||||
|
||||
# kd
|
||||
boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD")
|
||||
wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values)
|
||||
|
||||
# asa
|
||||
boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA")
|
||||
wilcox.test(dr_muts_lig$asa, other_muts_lig$asa)
|
||||
|
||||
# rsa
|
||||
boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA")
|
||||
wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa)
|
||||
|
||||
# checking agreement b/w mcsm and foldx
|
||||
cols_to_select = c("mutationinformation"
|
||||
, "mutation_info"
|
||||
, "duet_scaled"
|
||||
, "ddg"
|
||||
, "duet_outcome"
|
||||
, "foldx_outcome")
|
||||
|
||||
merged_df3_short = select(merged_df3, cols_to_select)
|
||||
|
||||
mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),]
|
||||
|
||||
mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
|
||||
table(mcsm_foldx$sign_comp)
|
||||
|
||||
# another way of checking
|
||||
merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
|
||||
table(merged_df3$sign_comp)
|
||||
|
||||
disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
|
||||
agreement = 100 - disagreement
|
||||
|
||||
cat("There is", agreement, "% between mcsm and foldx predictions")
|
Loading…
Add table
Add a link
Reference in a new issue