Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis

Tanushree Tunstall 2020-09-10 16:14:46 +01:00
commit 5102bbea1b
21 changed files with 2132 additions and 243 deletions

1
.gitignore vendored

@ -13,3 +13,4 @@ scratch
test
plotting_test
scripts_old
foldx/test/


@ -54,6 +54,8 @@ os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
@ -71,13 +73,27 @@ args = arg_parser.parse_args()
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
mut_regex = r'[0-9]+(\w{3})$'
print('mt regex:', mut_regex)
pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)
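# --- Illustrative sketch (not part of this commit) --------------------------
# How the patterns above behave on a typical mutation string, assuming entries
# look like 'pnca_p.trp68gly' (<gene>_p.<wt 3-letter><position><mut 3-letter>):
import re
example_mut = 'pnca_p.trp68gly'              # assumed format
re.search(wt_regex, example_mut).group(1)    # 'trp' -> wild-type residue
re.search(pos_regex, example_mut).group(1)   # '68'  -> first number found
re.search(mut_regex, example_mut).group(1)   # 'gly' -> 3 letters before string end
# -----------------------------------------------------------------------------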
#%%======================================================================= #%%=======================================================================
#============== #==============
# directories # directories
@ -155,6 +171,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns)
print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
, '\n===================================================================')
mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
#%%============================================================================
print('==================================='
, '\nSecond merge: dssp + kd'
@ -183,6 +201,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns)
print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
, '\n===================================================================')
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
print('======================================='
, '\nFourth merge: First merge + Third merge'
@ -203,12 +223,16 @@ else:
print('\nResult of Fourth merge:', combined_df.shape
, '\n===================================================================')
combined_df[merging_cols_m4].apply(len)
combined_df[merging_cols_m4].apply(len) == len(combined_df)
#%%============================================================================
# OR merges: TEDIOUSSSS!!!!
del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
#%%
print('==================================='
, '\nFifth merge: afor_df + afor_kin_df'
, '\n===================================')
@ -220,8 +244,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',')
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
print('Dim of afor_df:', afor_df.shape
@ -230,7 +252,7 @@ print('Dim of afor_df:', afor_df.shape
# finding if ALL afor_kin_df muts are present in afor_df
# i.e all kinship muts should be PRESENT in mycalcs_present
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
else:
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
@ -244,7 +266,7 @@ common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
extra_muts_myor = afor_kin_df.shape[0] - common_muts
print('=========================================='
, '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
, '\n==========================================')
print('Expected cals for merging with outer_join...')
@ -252,23 +274,29 @@ print('Expected cals for merging with outer_join...')
expected_rows = afor_df.shape[0] + extra_muts_myor
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
afor_df['mutation']
afor_kin_df['mutation']
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
, '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
else:
print('FAIL: could not combine OR dfs'
, '\nCheck expected rows and cols calculation and join type')
print('Dim of merged ors_df:', ors_df.shape)
ors_df[merging_cols_m5].apply(len)
ors_df[merging_cols_m5].apply(len) == len(ors_df)
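# Illustrative sketch (not part of this commit): the expected-dimension arithmetic
# used above for an outer join on a shared key, assuming unique keys per frame.
left_toy = pd.DataFrame({'mutation': ['m1', 'm2', 'm3'], 'a': [1, 2, 3]})
right_toy = pd.DataFrame({'mutation': ['m2', 'm3', 'm4'], 'b': [9, 8, 7]})
extra_toy = (~right_toy['mutation'].isin(left_toy['mutation'])).sum()   # 1 row only in right
exp_rows_toy = len(left_toy) + extra_toy                                # 3 + 1 = 4
exp_cols_toy = left_toy.shape[1] + right_toy.shape[1] - 1               # shared key counted once = 3
merged_toy = pd.merge(left_toy, right_toy, on = 'mutation', how = 'outer')
assert merged_toy.shape == (exp_rows_toy, exp_cols_toy)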
#%%============================================================================
# formatting ors_df
ors_df.columns
# Dropping unnecessary columns: already removed in earlier preprocessing
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
cols_to_drop = ['n_miss']
print('Dropping', len(cols_to_drop), 'columns:\n'
, cols_to_drop)
@ -283,18 +311,13 @@ column_order = ['mutation'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'ref_allele'
, 'alt_allele'
, 'mut_info_f1'
, 'mut_info_f2'
, 'mut_type'
, 'gene_id'
, 'gene_name'
, 'chromosome_number'
, 'af'
, 'af_kin'
@ -317,14 +340,9 @@ column_order = ['mutation'
, 'se_kin'
, 'zval_logistic'
, 'logl_h1_kin'
, 'l_remle_kin']
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
print('PASS: Column order generated for all:', len(column_order), 'columns'
, '\nColumn names match, safe to reorder columns'
, '\nApplying column order to df...' )
@ -337,6 +355,61 @@ else:
print('\nResult of Sixth merge:', ors_df_ordered.shape
, '\n===================================================================')
#%%
ors_df_ordered.shape
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
# populating 'nan' info
lookup_dict = dict()
for k, v in low_3letter_dict.items():
lookup_dict[k] = v['one_letter_code']
#print(lookup_dict)
wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
#print(wt)
ors_df_ordered['wild_type'] = wt.map(lookup_dict)
ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex)
mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
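# Illustrative sketch (not part of this commit): the block above turns a long-form
# mutation such as 'pnca_p.trp68gly' (format assumed) into the short form 'W68G',
# assuming low_3letter_dict keys are lowercase three-letter residue codes.
toy_muts = pd.Series(['pnca_p.trp68gly', 'pnca_p.ala134val'])
toy_wt = toy_muts.str.extract(wt_regex).squeeze().map(lookup_dict)    # 'W', 'A'
toy_pos = toy_muts.str.extract(pos_regex).squeeze()                   # '68', '134'
toy_mt = toy_muts.str.extract(mut_regex).squeeze().map(lookup_dict)   # 'G', 'V'
toy_wt + toy_pos + toy_mt                                             # 'W68G', 'A134V'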
# populate mut_info_f1
ors_df_ordered['mut_info_f1'].isna().sum()
ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
ors_df_ordered['mut_info_f1'].isna().sum()
# populate mut_info_f2
ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
# populate mut_type
ors_df_ordered['mut_type'].isna().sum()
#mut_type_word = ors_df_ordered['mut_type'].value_counts()
mut_type_word = 'missense' # FIXME, should be derived
ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
ors_df_ordered['mut_type'].isna().sum()
# populate gene_id
ors_df_ordered['gene_id'].isna().sum()
#gene_id_word = ors_df_ordered['gene_id'].value_counts()
gene_id_word = 'Rv2043c' # FIXME, should be derived
ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
ors_df_ordered['gene_id'].isna().sum()
# populate gene_name
ors_df_ordered['gene_name'].isna().sum()
ors_df_ordered['gene_name'].value_counts()
ors_df_ordered['gene_name'].fillna(gene, inplace = True)
ors_df_ordered['gene_name'].isna().sum()
# check numbers
ors_df_ordered['or_kin'].isna().sum()
# should be 0
ors_df_ordered['or_mychisq'].isna().sum()
#%%============================================================================
print('==================================='
, '\nSixth merge: Fourth + Fifth merge'
@ -345,52 +418,158 @@ print('==================================='
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
# dtype problems
if len(merging_cols_m6) > 1 and 'position' in merging_cols_m6:
print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
, '\norig length of merging_cols_m6:', len(merging_cols_m6))
merging_cols_m6.remove('position')
print('\nlength after removing:', len(merging_cols_m6))
print('Dim of df1:', combined_df.shape
, '\nDim of df2:', ors_df_ordered.shape
, '\nNo. of merging_cols:', len(merging_cols_m6))
print('Checking mutations in the two dfs:'
, '\nmuts in df1 present in df2:'
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
, '\nmuts in df2 present in df1:'
, ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())
#----------
# merge 6
#----------
combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
combined_df_all.shape
# sanity check for merge 6
outdf_expected_rows = len(combined_df) + extra_muts_myor
unique_muts = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
print('PASS: Df dimension match'
, '\ncombined_df_all with join type:', o_join
, '\n', combined_df_all.shape
, '\n===============================================================')
else:
print('FAIL: Df dimension mismatch'
, 'Cannot generate expected dim. See details of merge performed'
, '\ndf1 dim:', combined_df.shape
, '\ndf2 dim:', ors_df_ordered.shape
, '\nGot:', combined_df_all.shape
, '\nmuts in df1 but NOT in df2:'
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
, '\nmuts in df2 but NOT in df1:'
, ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
sys.exit()
# drop extra cols
all_cols = combined_df_all.columns
#pos_cols_check = combined_df_all[['position_x','position_y']]
c = combined_df_all[['position_x','position_y']].isna().sum()
pos_col_to_drop = c.index[c>0].to_list()
cols_to_drop = pos_col_to_drop + ['wild_type_kd']
print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
combined_df_all.drop(cols_to_drop, axis = 1, inplace = True)
# rename position_x to position
pos_col_to_rename = c.index[c==0].to_list()
combined_df_all.shape
combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
combined_df_all.shape
all_cols = combined_df_all.columns
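# Illustrative sketch (not part of this commit): pandas suffixes clashing non-key
# columns, which is why 'position_x'/'position_y' appear after the merge above and
# one of them has to be dropped and the other renamed back to 'position'.
toy_a = pd.DataFrame({'mutationinformation': ['W68G'], 'position': [68]})
toy_b = pd.DataFrame({'mutationinformation': ['W68G'], 'position': ['68'], 'af': [0.1]})
toy_m = pd.merge(toy_a, toy_b, on = 'mutationinformation', how = 'outer')
toy_m.columns.tolist()   # ['mutationinformation', 'position_x', 'position_y', 'af']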
#%% reorder cols to for convenience
first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
last_cols = [col for col in combined_df_all.columns if col not in first_cols]
combined_df_all = combined_df_all[first_cols+last_cols]
#%% IMPORTANT: check if mutation related info is all populated after this merge
# select string colnames to ensure no NA exist there
string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
print('PASS: All string cols are populated with no NAs')
else:
print('FAIL: NAs detected in string cols')
print(combined_df_all[string_cols].isna().sum(axis = 0))
sys.exit()
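# Illustrative sketch (not part of this commit) of the string-column selection above:
# applymap tests every cell and .all(0) keeps columns where every value is a str.
toy_df = pd.DataFrame({'mut': ['W68G', 'A134V'], 'or_val': [1.2, None]})
toy_df.columns[toy_df.applymap(lambda x: isinstance(x, str)).all(0)].tolist()   # ['mut']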
# relevant mut cols
check_mut_cols = merging_cols_m5 + merging_cols_m6
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
print(check_mut_cols)
c2 = combined_df_all[check_mut_cols].isna().sum()
missing_info_cols = c2.index[c2>0].to_list()
if c2.sum()>0:
#na_muts_n = combined_df_all['mutation'].isna().sum()
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
, '\nFetching these from reference dict...')
else:
print('No missing \'mutation\' has been detected!')
lookup_dict = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict[k] = v['three_letter_code_lower']
print(lookup_dict)
wt_3let = combined_df_all['wild_type'].map(lookup_dict)
#print(wt_3let)
pos = combined_df_all['position'].astype(str)
#print(pos)
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
#print(mt_3let)
# override the 'mutation' column
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
print(combined_df_all['mutation'])
# check again
if combined_df_all[missing_info_cols].isna().sum().all() == 0:
print('PASS: No mutations have missing \'mutation\' info.')
else:
print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
, '\nmutations have missing info STILL...')
sys.exit()
#%% check
foo = combined_df_all.drop_duplicates('mutationinformation')
foo2 = combined_df_all.drop_duplicates('mutation')
if foo.equals(foo2):
print('PASS: Dropping mutation or mutationinformation has the same effect\n')
else:
print('FAIL: Still problems in merged data')
sys.exit()
#%%============================================================================
output_cols = combined_df_all.columns
print('Output cols:', output_cols)
#%% IMPORTANT result info
if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
print('PASS: No. of NA in or_mychisq matches expected length'
, '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum()
, '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
else:
print('FAIL: No. of NA in or_mychisq does not match expected length')
if combined_df_all.shape[0] == outdf_expected_rows:
print('\nINFORMATIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
, '\n=============================================================')
else:
print('combined_df_all has no duplicate muts present'
,'\n===============================================================')
print('\nDim of combined_data:', combined_df_all.shape
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
#%%============================================================================
# write csv


@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019
#5. chain
#6. wild_pos
#7. wild_chain_pos
#=======================================================================
#%% load libraries
import os, sys
@ -83,16 +81,16 @@ gene = args.gene
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
mut_regex = r'[0-9]+(\w{3})$'
print('mt regex:', mut_regex)
pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)
# building cols to extract
@ -156,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
else:
core_cols = ['id'
, 'sample'
#, 'patient_id'
#, 'strain'
, 'lineage'
, 'sublineage'
#, 'country'
, 'country_code'
, 'geographic_source'
#, 'region'
#, 'location'
#, 'host_body_site'
#, 'environment_material'
#, 'host_status'
#, 'host_sex'
#, 'submitted_host_sex'
#, 'hiv_status'
#, 'HIV_status'
#, 'tissue_type'
#, 'isolation_source'
, resistance_col]
variable_based_cols = [drug
, dr_muts_col
, other_muts_col]
cols_to_extract = core_cols + variable_based_cols
print('Extracting', len(cols_to_extract), 'columns from master data')
@ -202,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
#%% Write check file
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
meta_data.to_csv(check_file, index = False)
print('Writing subsetted gwas data'
, '\nFile', check_file
, '\nDim:', meta_data.shape)
@ -217,9 +214,9 @@ print('Writing subsetted gwas data'
# drug counts: complete samples for OR calcs
meta_data[drug].value_counts()
print('===========================================================\n'
, 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n===========================================================\n'
, 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n===========================================================')
#%%
@ -1173,7 +1170,7 @@ del(out_filename_metadata_poscounts)
#%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata_raw.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('Writing file: LF formatted data'
, '\nFile:', outfile_metadata

211
scripts/ks_test_PS.R Normal file

@ -0,0 +1,211 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: KS test for PS/DUET lineage distributions
#=======================================================================
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()
#source("/plotting/Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
source("plotting/combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
###########################
# Data for stats
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info); str(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
# stats for all muts and dr_muts
# 1) for all muts
# 2) for dr_muts
#===========================
table(my_df$lineage); str(my_df$lineage)
table(my_df_dr$lineage); str(my_df_dr$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# subset and refactor: all muts
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
df_lin$lineage = factor(df_lin$lineage)
# subset and refactor: dr muts
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
df_lin_dr$lineage = factor(df_lin_dr$lineage)
#{RESULT: No of samples within lineage}
table(df_lin$lineage)
table(df_lin_dr$lineage)
#{Result: No. of unique mutations the 4 lineages contribute to}
length(unique(df_lin$mutationinformation))
length(unique(df_lin_dr$mutationinformation))
# COMPARING DISTRIBUTIONS
#================
# ALL mutations
#=================
head(df_lin$lineage)
df_lin$lineage = as.character(df_lin$lineage)
lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
# ks test
lin12 = ks.test(lin1,lin2)
lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
lin13 = ks.test(lin1,lin3)
lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
lin14 = ks.test(lin1,lin4)
lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
lin23 = ks.test(lin2,lin3)
lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
lin24 = ks.test(lin2,lin4)
lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
lin34 = ks.test(lin3,lin4)
lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
ks_results_all = rbind(lin12_df
, lin13_df
, lin14_df
, lin23_df
, lin24_df
, lin34_df)
#p-value < 2.2e-16
rm(lin12, lin12_df
, lin13, lin13_df
, lin14, lin14_df
, lin23, lin23_df
, lin24, lin24_df
, lin34, lin34_df)
#================
# DRUG mutations
#=================
head(df_lin_dr$lineage)
df_lin_dr$lineage = as.character(df_lin_dr$lineage)
lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
# ks test: dr muts
lin12_dr = ks.test(lin1_dr,lin2_dr)
lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
lin13_dr = ks.test(lin1_dr,lin3_dr)
lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
lin14_dr = ks.test(lin1_dr,lin4_dr)
lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
lin23_dr = ks.test(lin2_dr,lin3_dr)
lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
lin24_dr = ks.test(lin2_dr,lin4_dr)
lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
lin34_dr = ks.test(lin3_dr,lin4_dr)
lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
ks_results_dr = rbind(lin12_df_dr
, lin13_df_dr
, lin14_df_dr
, lin23_df_dr
, lin24_df_dr
, lin34_df_dr)
ks_results_combined = cbind(ks_results_all, ks_results_dr)
my_colnames = c("Lineage_comparisons"
, paste0("All_mutations n=", nrow(df_lin))
, paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
my_colnames
# select the output columns
ks_results_combined_f = ks_results_combined[,c(1,2,4)]
colnames(ks_results_combined_f) = my_colnames
ks_results_combined_f
#=============
# write output file
#=============
ks_results = paste0(outdir,"/results/ks_results.csv")
write.csv(ks_results_combined_f, ks_results, row.names = F)
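For readers who prefer Python, the pairwise lineage comparison above can be sketched with scipy.stats.ks_2samp (illustrative only, not part of this commit; the 'lineage' and 'duet_scaled' column names come from the script above, while the input file path is hypothetical):

import itertools
import pandas as pd
from scipy.stats import ks_2samp

df = pd.read_csv('merged_df2.csv')  # hypothetical CSV export of merged_df2
lineages = ['lineage1', 'lineage2', 'lineage3', 'lineage4']
rows = []
for a, b in itertools.combinations(lineages, 2):
    x = df.loc[df['lineage'] == a, 'duet_scaled'].dropna()
    y = df.loc[df['lineage'] == b, 'duet_scaled'].dropna()
    stat, pval = ks_2samp(x, y)
    rows.append({'comparison': a + ' vs ' + b, 'ks_stat': stat, 'p_value': pval})
print(pd.DataFrame(rows))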


@ -104,7 +104,7 @@ or_df.columns
#%% snp_info file: master and gene specific ones
# gene info
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11
#info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
print('*****RESULT*****'
@ -212,7 +212,7 @@ else:
#PENDING Jody's reply
# !!!!!!!!
# drop nan from dfm2_mis as these are not useful and JP confirmed the same
print('Dropping NAs before further processing...')
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
# !!!!!!!!


@ -1,45 +0,0 @@
#!/usr/bin/env python3
#=======================================================================
#%% useful links
#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
#https://pypi.org/project/autoviz/
#=======================================================================
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
from autoviz.AutoViz_Class import AutoViz_Class
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#%%============================================================================
# variables
gene = 'pncA'
drug = 'pyrazinamide'
#%%============================================================================
#==============
# directories
#==============
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
in_filename_plotting = 'car_design.csv'
in_filename_plotting = gene.lower() + '_all_params.csv'
infile_plotting = outdir + '/' + in_filename_plotting
print('plotting file: ', infile_plotting
, '\n============================================================')
#=======================================================================
plotting_df = pd.read_csv(infile_plotting, sep = ',')
#Instantiate the AutoViz class
AV = AutoViz_Class()
df = AV.AutoViz(infile_plotting)
#df2 = AV.AutoViz(plotting_df)
plotting_df.columns[~plotting_df.columns.isin(df.columns)]


@ -1,5 +1,4 @@
#!/usr/bin/env Rscript
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
@ -19,8 +18,7 @@ getwd()
library(ggplot2)
library(data.table)
source("barplot_colour_function.R")
source("subcols_axis.R")
# should return the following dfs, directories and variables
# mut_pos_cols
@ -161,9 +159,7 @@ min(df$duet_scaled)
max(df$duet_scaled)
# sanity checks
tapply(df$duet_scaled, df$duet_outcome, min)
tapply(df$duet_scaled, df$duet_outcome, max)
# My colour FUNCTION: based on group and subgroup
@ -241,7 +237,7 @@ outPlot = g +
, axis.ticks.x = element_blank()) +
labs(title = ""
#title = my_title
, x = "Position"
, y = "Frequency")
print(outPlot)


@ -1,87 +0,0 @@
,x,,changes,
1,mutationinformation,,Mutationinformation,
2,wild_type,,,consider...wild_aa
3,position,,Position,
4,mutant_type,,,consider...mutant_aa
5,chain,,,
6,ligand_id,,,
7,ligand_distance,,,
8,duet_stability_change,,,
9,duet_outcome,,DUET_outcome,
10,ligand_affinity_change,,,
11,ligand_outcome,,Lig_outcome,
12,duet_scaled,,ratioDUET,
13,affinity_scaled,,ratioPredAff,
14,wild_pos,,WildPos,
15,wild_chain_pos,,,
16,ddg,,,
17,contacts,,,
18,electro_rr,,,
19,electro_mm,,,
20,electro_sm,,,
21,electro_ss,,,
22,disulfide_rr,,,
23,disulfide_mm,,,
24,disulfide_sm,,,
25,disulfide_ss,,,
26,hbonds_rr,,,
27,hbonds_mm,,,
28,hbonds_sm,,,
29,hbonds_ss,,,
30,partcov_rr,,,
31,partcov_mm,,,
32,partcov_sm,,,
33,partcov_ss,,,
34,vdwclashes_rr,,,
35,vdwclashes_mm,,,
36,vdwclashes_sm,,,
37,vdwclashes_ss,,,
38,volumetric_rr,,,
39,volumetric_mm,,,
40,volumetric_sm,,,
41,volumetric_ss,,,
42,wild_type_dssp,,,
43,asa,,,
44,rsa,,,
45,ss,,,
46,ss_class,,,
47,chain_id,,,
48,wild_type_kd,,,
49,kd_values,,,
50,rd_values,,,
51,wt_3letter_caps,,,
52,mutation,,,
53,af,,,
54,beta_logistic,,,
55,or_logistic,,,
56,pval_logistic,,,
57,se_logistic,,,
58,zval_logistic,,,
59,ci_low_logistic,,,
60,ci_hi_logistic,,,
61,or_mychisq,,,
62,or_fisher,,,
63,pval_fisher,,,
64,ci_low_fisher,,,
65,ci_hi_fisher,,,
66,est_chisq,,,
67,pval_chisq,,,
68,chromosome_number,,,
69,ref_allele,,,
70,alt_allele,,,
71,mut_type,,,
72,gene_id,,,
73,gene_number,,,
74,mut_region,,,
75,mut_info,,,
76,chr_num_allele,,,
77,wt_3let,,,
78,mt_3let,,,
79,af_kin,,,
80,or_kin,,,
81,pwald_kin,,,
82,beta_kin,,,
83,se_kin,,,
84,logl_h1_kin,,,
85,l_remle_kin,,,
86,n_miss,,,


@ -6,6 +6,7 @@
# 2) <gene>_meta_data.csv
# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR,etc
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
@ -36,17 +37,23 @@ source("plotting_data.R")
# dup_muts
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
# clear excess variable
rm(my_df, upos, dup_muts)
@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
#in_file1: output of plotting_data.R
# my_df_u
# infile 2: gene associated meta data
#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv")
in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv")
@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
, header = T)
cat("Dim:", dim(gene_metadata))
table(gene_metadata$mutation_info)
# counting NAs in AF, OR cols
# or_mychisq
@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
}
str(gene_metadata)
###################################################################
# combining: PS
@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
, "\nNo. of merging cols:", length(merging_cols)
, "\nMerging columns identified:"))
print(merging_cols)
# important checks!
@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
, by = merging_cols
, all.y = T)
cat("Dim of merged_df2: ", dim(merged_df2))
head(merged_df2$position)
# sanity check
@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2))
} else{
cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy")
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
, "\nNA in AF:", sum(is.na(merged_df3$af_kin))) , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
} }
#========================= #=========================
# Merge3: merged_df2_comp # Merge3: merged_df2_comp
# same as merge 1 but excluding NAs from ORs, etc. # same as merge 1 but excluding NAs from ORs, etc.
@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
na_count_df2 = sum(is.na(merged_df2$af)) na_count_df2 = sum(is.na(merged_df2$af))
merged_df2_comp = merged_df2[!is.na(merged_df2$af),] merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
# sanity check: no +-1 gymnastics # sanity check: no +-1 gymnastics
cat("Checking nrows in merged_df2_comp") cat("Checking nrows in merged_df2_comp")
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
@ -250,16 +261,18 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
,"\nGot no. of rows: ", nrow(merged_df2_comp)) ,"\nGot no. of rows: ", nrow(merged_df2_comp))
} }
#========================= #=========================
# Merge4: merged_df3_comp # Merge4: merged_df3_comp
# same as merge 2 but excluding NAs from ORs, etc or # same as merge 2 but excluding NAs from ORs, etc or
# remove duplicate mutation information # remove duplicate mutation information
#========================= #=========================
na_count_df3 = sum(is.na(merged_df3$af)) na_count_df3 = sum(is.na(merged_df3$af))
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
cat("\nChecking nrows in merged_df3_comp") cat("Checking nrows in merged_df3_comp")
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
cat("\nPASS: No. of rows match" cat("\nPASS: No. of rows match"
,"\nDim of merged_df3_comp: " ,"\nDim of merged_df3_comp: "
@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
, "\nNo. of rows: ", nrow(merged_df3_comp) , "\nNo. of rows: ", nrow(merged_df3_comp)
, "\nNo. of cols: ", ncol(merged_df3_comp)) , "\nNo. of cols: ", ncol(merged_df3_comp))
}else{ }else{
cat("\nFAIL: No. of rows mismatch" cat("FAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
,"\nGot no. of rows: ", nrow(merged_df3_comp)) ,"\nGot no. of rows: ", nrow(merged_df3_comp))
} }
# alternate way of deriving merged_df3_comp # alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$af),] foo = merged_df3[!is.na(merged_df3$af),]
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@ -307,7 +319,8 @@ all.equal(foo, bar)
# clear variables
rm(foo, bar, gene_metadata
, in_filename_params, infile_params, merging_cols
, in_filename_gene_metadata, infile_gene_metadata
, merged_df2v2, merged_df2v3)
#*************************
#####################################################################
# Combining: LIG
@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
#==========================================================================
# end of script
##==========================================================================

0
scripts/plotting/combining_two_df_FIXME.R Executable file → Normal file


@ -0,0 +1,263 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Corr plots for PS and Lig
# Output: 1 svg
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
require(cowplot)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
#=======
# output
#=======
# can't combine by cowplot because not ggplots
#corr_plot_combined = "corr_combined.svg"
#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined)
# PS
corr_ps = "corr_PS.svg"
plot_corr_ps = paste0(plotdir,"/", corr_ps)
# LIG
corr_lig = "corr_LIG.svg"
plot_corr_lig = paste0(plotdir,"/", corr_lig)
####################################################################
# end of loading libraries and functions #
########################################################################
#%%%%%%%%%%%%%%%%%%%%%%%%%
df_ps = merged_df3_comp
df_lig = merged_df3_comp_lig
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Data for Correlation plots:PS
#===========================
table(df_ps$duet_outcome)
# adding log cols
df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
df_ps$log10_or_kin = log10(df_ps$or_kin)
df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
# subset data to generate pairwise correlations
cols_to_select = c("duet_scaled"
, "log10_or_mychisq"
, "neglog_pval_fisher"
, "log10_or_kin"
, "neglog_pwald_kin"
, "af"
, "duet_outcome"
, drug)
corr_data_ps = df_ps[, cols_to_select]
dim(corr_data_ps)
#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic
#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
, "Log(OR)"
, "-Log(P)"
, "Log(OR adjusted)"
, "-Log(P wald)"
, "AF"
, "duet_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_ps)
colnames(corr_data_ps) <- my_corr_colnames
colnames(corr_data_ps)
#-----------------
# generate corr PS plot
#-----------------
start = 1
end = which(colnames(corr_data_ps) == drug); end # should be the last column
offset = 1
my_corr_ps = corr_data_ps[start:(end-offset)]
head(my_corr_ps)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
cat("Corr plot PS:", plot_corr_ps)
svg(plot_corr_ps, width = 15, height = 15)
OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 2.1
, cex.cor = 1
, smooth = F
)
print(OutPlot1)
dev.off()
#===========================
# Data for Correlation plots: LIG
#===========================
table(df_lig$ligand_outcome)
df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
df_lig$log10_or_kin = log10(df_lig$or_kin)
df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
# subset data to generate pairwise correlations
cols_to_select = c("affinity_scaled"
, "log10_or_mychisq"
, "neglog_pval_fisher"
, "log10_or_kin"
, "neglog_pwald_kin"
, "af"
, "ligand_outcome"
, drug)
corr_data_lig = df_lig[, cols_to_select]
dim(corr_data_lig)
# assign nice colnames (for display)
my_corr_colnames = c("Ligand Affinity"
, "Log(OR)"
, "-Log(P)"
, "Log(OR adjusted)"
, "-Log(P wald)"
, "AF"
, "ligand_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_lig)
colnames(corr_data_lig) <- my_corr_colnames
colnames(corr_data_lig)
#-----------------
# generate corr LIG plot
#-----------------
start = 1
end = which(colnames(corr_data_lig) == drug); end # should be the last column
offset = 1
my_corr_lig = corr_data_lig[start:(end-offset)]
head(my_corr_lig)
cat("Corr LIG plot:", plot_corr_lig)
svg(plot_corr_lig, width = 15, height = 15)
OutPlot2 = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 2.1
, cex.cor = 1
, smooth = F
)
print(OutPlot2)
dev.off()
#######################################################

56
scripts/plotting/dirs.R Normal file

@ -0,0 +1,56 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: formatting data that will be used for various plots
# useful links
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
#########################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)
require("getopt", quietly = TRUE) #cmd parse arguments
#========================================================
# command line args
#spec = matrix(c(
# "drug" , "d", 1, "character",
# "gene" , "g", 1, "character"
#), byrow = TRUE, ncol = 4)
#opt = getopt(spec)
#drug = opt$drug
#gene = opt$gene
#if(is.null(drug)|is.null(gene)) {
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
#}
#========================================================
# FIXME: change to cmd
#%% variable assignment: input and output paths & filenames
drug = "pyrazinamide"
gene = "pncA"
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
# directories and variables
#=============
datadir = paste0("~/git/Data")
indir = paste0(datadir, "/", drug, "/input")
outdir = paste0("~/git/Data", "/", drug, "/output")
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
dr_muts_col = paste0('dr_mutations_', drug)
other_muts_col = paste0('other_mutations_', drug)
resistance_col = "drtype"
#%%===============================================================


@ -0,0 +1,214 @@
#!/usr/bin/env Rscript
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
#########################################################
# TASK: Basic lineage barplot showing numbers
# Output: Basic barplot with lineage samples and mut count
##########################################################
# Installing and loading required packages
##########################################################
source("Header_TT.R")
require(data.table)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#===========
# input
#===========
# output of combining_dfs_plotting.R
#=======
# output
#=======
# plot 1
basic_bp_lineage = "basic_lineage_barplot.svg"
plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage)
#=======================================================================
#================
# Data for plots:
# you need merged_df2, comprehensive one
# since this has one-many relationship
# i.e the same SNP can belong to multiple lineages
#================
# REASSIGNMENT as necessary
my_df = merged_df2
# clear excess variable
rm(merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
as.data.frame(table(my_df$lineage))
#=============
# Data for plots
#=============
# REASSIGNMENT
df <- my_df
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4"
#, "lineage5"
#, "lineage6"
#, "lineage7"
)
df_lin = subset(df, subset = lineage %in% sel_lineages)
# Create df with lineage info & no. of unique mutations
# per lineage and total samples within each lineage
# this is essentially a barplot with two y-axes
bar = as.data.frame(sel_lineages) # 4 rows, 1 col
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id[df$lineage==i]))
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$mutationinformation)))
curr_count = length(unique(foo$mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
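## Equivalent per-lineage summary with dplyr (a sketch; assumes dplyr is installed;
## 'bar_dplyr' only cross-checks 'bar' and is not used downstream)
library(dplyr)
bar_dplyr = df_lin %>%
  group_by(lineage) %>%
  summarise(num_snps_u    = n_distinct(mutationinformation)
            , total_samples = n_distinct(id)) %>%
  as.data.frame()
bar_dplyr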
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
# FIXME later: will be deprecated!
melted = melt(to_plot, id = "x")
melted
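## Non-deprecated reshaping alternative (a sketch; assumes tidyr is installed;
## output has the same x/variable/value columns as 'melted' above)
melted_tidyr = tidyr::pivot_longer(to_plot, cols = c(y1, y2)
                                   , names_to = "variable", values_to = "value")
melted_tidyr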
svg(plot_basic_bp_lineage)
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted, aes(x = x
, y = value
, fill = variable))
printFile = g + geom_bar(stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75') +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als
, colour = 'black')
, axis.title.y = element_text(size = my_als
, colour = 'black')
, legend.position = "top"
, legend.text = element_text(size = my_als)) +
#geom_text() +
geom_label(aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)) +
labs(title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black') +
scale_fill_manual(values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')) +
scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
print(printFile)
dev.off()

View file

@ -0,0 +1,301 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Lineage dist plots: ggridges
# Output: 2 SVGs for PS stability
# 1) all muts
# 2) dr_muts
##########################################################
# Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
library(ggridges)
source("combining_dfs_plotting.R")
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#=======
# output
#=======
lineage_dist_combined = "lineage_dist_combined_PS.svg"
plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined)
#========================================================================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this has a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts; at this level, we should use
# as much info as available, hence use the df with NAs
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 1: ALL Muts
# x = mcsm_values, y = dist
# fill = stability
#============================
my_plot_name = 'lineage_dist_PS.svg'
plot_lineage_duet = paste0(plotdir,"/", my_plot_name)
#===================
# Data for plots
#===================
table(my_df$lineage); str(my_df$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4"
#, "lineage5"
#, "lineage6"
#, "lineage7"
)
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
table(df_lin$lineage)
# refactor
df_lin$lineage = factor(df_lin$lineage)
sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
table(df_lin$lineage)#{RESULT: No of samples within lineage}
length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
length(df_lin$mutationinformation)
u2 = unique(my_df$mutationinformation)
u = unique(df_lin$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
#, 'Lineage 5', 'Lineage 6', 'Lineage 7'
)
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
# , 'lineage5', 'lineage6', 'lineage7'
)
# check plot name
plot_lineage_duet
# output svg
#svg(plot_lineage_duet)
p1 = ggplot(df, aes(x = duet_scaled
, y = duet_outcome))+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient(aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
#, switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)) +
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-5)
, legend.title = element_text(size = my_als)
)
print(p1)
#dev.off()
#######################################################################
# lineage distribution plot for dr_muts
#######################################################################
#==========================
# Plot 2: dr muts ONLY
# x = mcsm_values, y = dist
# fill = stability
#============================
my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr)
#===================
# Data for plots
#===================
table(my_df_dr$lineage); str(my_df_dr$lineage)
# uncomment as necessary
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
table(df_lin_dr$lineage)
# refactor
df_lin_dr$lineage = factor(df_lin_dr$lineage)
sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
length(df_lin_dr$mutationinformation)
u2 = unique(my_df_dr$mutationinformation)
u = unique(df_lin_dr$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df_dr <- df_lin_dr
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin_dr)
#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
# check plot name
plot_lineage_dr_duet
# output svg
#svg(plot_lineage_dr_duet)
p2 = ggplot(df_dr, aes(x = duet_scaled
, y = duet_outcome))+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient(aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
#, switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)
#, ylim = c(0, 6)
#, clip = "off"
) +
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
#, legend.position = "none"
)
print(p2)
#dev.off()
########################################################################
#==============
# combine plot
#===============
svg(plot_lineage_dist_combined, width = 12, height = 6)
printFile = cowplot::plot_grid(p1, p2
, label_size = my_als+10)
print(printFile)
dev.off()

View file

@ -0,0 +1,96 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: To write muts with opposite effects on
# protomer and ligand stability
#########################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("plotting_data.R")
# should return the following dfs, directories and variables
# my_df
# my_df_u
# my_df_u_lig
# dup_muts
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nLength of upos:", length(upos)
, "\nAngstrom symbol:", angstroms_symbol))
# clear excess variable
rm(my_df, upos, dup_muts)
#========================================================
#===========
# input
#===========
#in_file1: output of plotting_data.R
# my_df_u
#===========
# output
#===========
# mutations with opposite effects
out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
#%%===============================================================
# spelling Correction 1: DUET, in case American spelling is needed!
table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
# spelling Correction 2: Ligand, in case American spelling is needed!
table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
# muts with opposing effects on protomer and ligand stability
table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
# sanity check: redundant, but uber cautious!
dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
cat("Identifying muts with opposite stability effects")
if(nrow(changes) == sum(my_df_u$duet_outcome != my_df_u$ligand_outcome) & identical(dl_i, ld_i)) {
cat("PASS: muts with opposite effects on stability and affinity correctly identified"
, "\nNo. of such muts: ", nrow(changes))
}else {
cat("FAIL: unsuccessful in extracting muts with changed stability effects")
}
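## Equivalent selection with dplyr (a sketch; assumes dplyr is installed;
## 'changes_alt' is only a cross-check and is not used downstream)
changes_alt = dplyr::filter(my_df_u, duet_outcome != ligand_outcome)
nrow(changes_alt) == nrow(changes) # expected TRUE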
#==========================
# write file: changed muts
#==========================
write.csv(changes, outfile_opp_muts)
cat("Finished writing file for muts with opp effects:"
, "\nFilename: ", outfile_opp_muts
, "\nDim:", dim(changes))
# clear variables
rm(out_filename_opp_muts, outfile_opp_muts)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(my_df_u, function(y) sum(is.na(y))); na_count
df_ncols = ncol(my_df_u)
#===================================== end of script

View file

@ -0,0 +1,192 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Bubble plot of OR for PS and Lig
# Output: 1 svg
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
require(cowplot)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
#=========================
#=======
# output
#=======
or_combined = "or_combined_PS_LIG.svg"
plot_or_combined = paste0(plotdir,"/", or_combined)
#or_kin_combined = "or_kin_combined_PS_LIG.svg"
#plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined)
#=======================================================================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
ps_df = merged_df3_comp
lig_df = merged_df3_comp_lig
# Ensure correct data type in columns to plot: should be TRUE
is.numeric(ps_df$or_mychisq)
is.numeric(lig_df$or_mychisq)
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
#%% end of section 1
# sanity check: should be <10
if (max(lig_df$ligand_distance) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
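## Possible corrective step if the check above fails (a sketch, kept commented out
## so it does not silently alter the data; column name as used above):
#lig_df = lig_df[lig_df$ligand_distance < 10, ]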
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#-----------------
# Plot 1: DUET vs OR by position as geom_points
#-------------------
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising'
#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising'
table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome))
g1 = ggplot(ps_df, aes(x = factor(position)
, y = duet_scaled))
p1 = g1 +
geom_point(aes(col = duet_outcome
, size = or_mychisq))+
#, size = or_kin)) + # not good, almost like log(or)
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#-------------------
# generate plot 2: Lig vs OR by position as geom_points
#-------------------
# Spelling Correction: made redundant as already corrected at the source
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising'
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising'
table(lig_df$ligand_outcome)
g2 = ggplot(lig_df, aes(x = factor(position)
, y = affinity_scaled))
p2 = g2 +
geom_point(aes(col = ligand_outcome
, size = or_mychisq))+
#, size = or_kin)) + # not good! almost like log(or)
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
# combine using cowplot
#======================
svg(plot_or_combined, width = 32, height = 12)
#svg(plot_or_kin_combined, width = 32, height = 12)
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("", "")
, label_size = my_als+5))
print(printFile)
dev.off()

View file

@ -15,7 +15,10 @@ library(ggplot2)
library(data.table)
library(dplyr)
source("dirs.R")
require("getopt", quietly = TRUE) #cmd parse arguments
#========================================================
# command line args
#spec = matrix(c(
@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
#}
#========================================================
#%% variable assignment: input and output paths & filenames
drug = "pyrazinamide"
gene = "pncA"
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
# directories
#=============
datadir = paste0("~/git/Data")
indir = paste0(datadir, "/", drug, "/input")
outdir = paste0("~/git/Data", "/", drug, "/output")
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
#======
# input
#======
@ -52,6 +42,17 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
infile_params = paste0(outdir, "/", in_filename_params)
cat(paste0("Input file 1:", infile_params) )
cat('columns based on variables:\n'
, drug
, '\n'
, dr_muts_col
, '\n'
, other_muts_col
, "\n"
, resistance_col
, '\n===============================================================')
#%%===============================================================
###########################
# Read file: struct params
@ -62,9 +63,6 @@ my_df = read.csv(infile_params, header = T)
cat("\nInput dimensions:", dim(my_df))
# quick checks
#colnames(my_df)
#str(my_df)
###########################
# extract unique mutation entries # extract unique mutation entries

View file

@ -0,0 +1,251 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: AF, OR and stability plots: PS
# Output: 1 SVG
#########################################################
# Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
library(ggplot2)
library(data.table)
#source("combining_dfs_plotting.R")
source("barplot_colour_function.R")
source("subcols_axis.R")
###########################
# Data for PS plots
# you need merged_df3_comp/merged_df_comp
# since these have unique SNPs
###########################
no_or_index = which(is.na(my_df_u_cols$or_mychisq))
my_df_u_cols_clean = my_df_u_cols[-no_or_index,]
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df_u_cols_clean
#%%%%%%%%%%%%%%%%%%%%%%%%%
cols_to_select = colnames(mut_pos_cols)
mut_pos_cols_clean = df[colnames(df)%in%cols_to_select]
mut_pos_cols_clean = unique(mut_pos_cols_clean)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Barplot with axis colours:
#===========================
#================
# Inspecting mut_pos_cols
# position numbers and colours and assigning axis colours based on lab_fg
# of the correct df
# open file from desktop ("sample_axis_cols") for cross checking
#================
# very important!
#my_axis_colours = mut_pos_cols$lab_fg
if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){
print("PASS: lengths checked, assigning axis colours")
my_axis_colours = mut_pos_cols_clean$lab_fg
cat("length of axis colours:", length(my_axis_colours)
, "\nwhich corresponds to the number of positions on the x-axis of the plot")
}else{
print("FAIL:lengths mismatch, could not assign axis colours")
quit()
}
# unique positions
upos = unique(df$position); length(upos)
table(df$duet_outcome)
# add frequency of positions (from lib data.table)
setDT(df)[, pos_count := .N, by = .(position)]
# this is cumulative
table(df$pos_count)
# use group by on this
library(dplyr)
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
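## Equivalent group-by with data.table (a sketch; df is already a data.table
## after setDT() above; the result only cross-checks snpsBYpos_df)
snpsBYpos_dt = df[, .(snpsBYpos = mean(pos_count)), by = position]
table(snpsBYpos_dt$snpsBYpos)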
# sanity checks
# should be a factor
if (is.factor(df$duet_outcome)){
print("duet_outcome is factor")
}else{
print("converting duet_outcome to factor")
df$duet_outcome = as.factor(df$duet_outcome)
}
is.factor(df$duet_outcome)
# should be -1 and 1
min(df$duet_scaled)
max(df$duet_scaled)
# sanity checks
tapply(df$duet_scaled, df$duet_outcome, min)
tapply(df$duet_scaled, df$duet_outcome, max)
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = duet_outcome
# subgroup = normalised score i.e duet_scaled
# check unique values in normalised data
u = unique(df$duet_scaled)
cat("No. of unique values in normalised data:", length(u))
# Define group
# Create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$duet_scaled # no rounding
df$group <- paste0(df$duet_outcome, "_", my_grp)
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
print(paste0("Colour palette generated for: ", length(colours), " colours"))
my_title = "Protein stability (DUET)"
cat("No. of axis colours: ", length(my_axis_colours))
#******************
# generate plot: barplot unordered by frequency with axis colours
#******************
class(df$lab_bg)
# define cartesian coord
my_xlim = length(unique(df$position)); my_xlim
# axis label size
my_xals = 18
my_yals = 18
# axes text size
my_xats = 14
my_yats = 18
g3 = ggplot(df, aes(factor(position, ordered = T)))
p3 = g3 +
coord_cartesian(xlim = c(1, my_xlim)
#, ylim = c(0, 6)
, ylim = c(0, max(snp_count))
, clip = "off") +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual(values = colours
, guide = "none") +
geom_tile(aes(y = -0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(y = -1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it"s important to specify that your axis goes from 1 to max number of levels
theme(axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals+2 )
, axis.ticks.x = element_blank()) +
labs(title = ""
#title = my_title
, x = "Position"
, y = "Frequency")
p3
#=================
# generate plot: AF by position unordered, not coloured
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
g1 = ggplot(df)
p1 = g1 +
geom_bar(aes(x = factor(position, ordered = T)
, y = af*100
#, fill = duet_outcome
)
, color = "black"
, fill = "lightgrey"
, stat = "identity") +
theme(axis.text.x = element_blank()
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_als) ) +
labs(title = ""
#, size = 100
#, x = "Wild-type position"
, y = "AF(%)")
p1
#=================
# generate plot: OR by position unordered
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
g2 = ggplot(df)
p2 = g2 +
geom_bar(aes(x = factor(position, ordered = T)
, y = or_mychisq
#, fill = duet_outcome
)
, colour = "black"
, fill = "lightgray"
, stat = "identity") +
theme(axis.text.x = element_blank()
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_als) ) +
labs(#title = "OR by position"
#, x = "Wild-type position"
y = "OR")
p2
########################################################################
# end of DUET barplots #
########################################################################
#=======
# output
#=======
ps_plots_combined = "af_or_combined_PS.svg"
plot_ps_plots_combined = paste0(plotdir,"/", ps_plots_combined )
svg(plot_ps_plots_combined , width = 26, height = 12)
#theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(p1, p2, p3
, ncol = 1
, align = 'v')
print(printFile)
dev.off()

View file

@ -0,0 +1,136 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
#(which is one of the outputs from python script)
# Input csv file: <gene>_metadata.csv
# Output csv file: <gene>_metadata_clean.csv
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("dirs.R")
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#========================================================
#===========
# input
#===========
# infile1: gene associated meta data
gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv")
infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
cat("Input infile 1:", infile_gene_metadata_raw)
# infile 2: ambiguous muts
ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv")
infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
cat("Input infile 2:", infile_ambiguous_muts)
#===========
# output
#===========
# clean gene metadata with ambiguous muts resolved
gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean)
cat("Output file:", outfile_gene_metadata_clean)
#%%===============================================================
###########################
# Read file: <gene>_metadata_raw.csv
###########################
cat("Reading meta data file:", infile_gene_metadata_raw)
gene_metadata_raw <- read.csv(infile_gene_metadata_raw
, stringsAsFactors = F
, header = T)
cat("Dim:", dim(gene_metadata_raw))
###########################
# Read file: ambiguous muts.csv
##########################
ambiguous_muts = read.csv(infile_ambiguous_muts)
ambiguous_muts_names = ambiguous_muts$mutation
common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
#==============
# resolving ambiguous muts
#===============
table(gene_metadata_raw$mutation_info)
count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
count_check$checks = ifelse(count_check[[dr_muts_col]] > 0 & count_check[[other_muts_col]] > 0
, "ambi", "pass")
table(count_check$checks)
cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
# remove all ambiguous muts from df
ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
# sanity checks: should be true
table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
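## Optional guard (a sketch): halt here if the two sanity checks above do not hold
stopifnot(!any(gene_metadata_raw_unambiguous$mutation %in% common_muts_all$mutation)
          , nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw))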
# check before resolving ambiguous muts
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
# resolving ambiguous muts: change other_muts to dr_muts
common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = as.character(common_muts_all$mutation_info)
# combining to a clean df
gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
all(dim(gene_metadata) == dim(gene_metadata_raw))
table(gene_metadata$mutation_info)
count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
, gene_metadata$mutation_info)))
count_check2$checks = ifelse(count_check2[[dr_muts_col]] > 0 & count_check2[[other_muts_col]] > 0
, "ambi", "pass")
if (all(count_check2$checks == "pass")){
cat("PASS: ambiguous muts successfully resolved."
, "\nwriting file")
}else{
print("FAIL: ambiguous muts could not be resolved!")
quit()
}
#============
# writing file
#=============
write.csv(gene_metadata, outfile_gene_metadata_clean
, row.names = FALSE)
#=====================================================================
# End of script
#======================================================================

View file

@ -1,12 +1,26 @@
#========
# plotting
#========
FIXME:
#=======================
dirs.R
#=======================
#dirs.R: imports dir structure
change this to a cmd-line script so you can pass in the drug and gene name
#=======================
plotting_data.R
#=======================
??? update how to run
#=======================
combining_dfs_plotting.R
#=======================
??? update how to run
#=======================
mcsm_mean_stability.R
#=======================
@ -58,6 +72,13 @@ barplots_subcolours_PS.R
# output plots: 1 svg
1) barplot_coloured_PS.svg
#=======================
lineage_dist_PS.R
# lineage distribution for all muts and dr muts in one figure
#=======================
# input: calls the "combining_dfs_plotting.R"
# output plots: 1 svg
1) lineage_dist_combined_PS.svg
########################################################################

96
scripts/plotting/test.R Normal file
View file

@ -0,0 +1,96 @@
setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting")
source("combining_dfs_plotting.R")
table(merged_df3$mutation_info)
# assign foldx
#ddg<0 = "Stabilising" (-ve)
table(merged_df3$ddg < 0)
merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising")
#===========
# PS data
#===========
dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",]
other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",]
par(mfrow = c(1,1))
par(mfrow = c(2,6))
# mcsm duet
boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET"
#, col = factor(merged_df3$duet_outcome)
)
wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F)
# foldx ddg
boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx")
wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F)
# rd
boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD")
wilcox.test(dr_muts$rd_values, other_muts$rd_values)
# kd
boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD")
wilcox.test(dr_muts$kd_values, other_muts$kd_values)
# asa
boxplot(dr_muts$asa, other_muts$asa, main = "ASA")
wilcox.test(dr_muts$asa, other_muts$asa)
# rsa
boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA")
wilcox.test(dr_muts$rsa, other_muts$rsa)
#===================================================================
#==========
# LIG data
#==========
dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",]
other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",]
# mcsm ligand affinity
boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity")
wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F)
# rd
boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD")
wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values)
# kd
boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD")
wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values)
# asa
boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA")
wilcox.test(dr_muts_lig$asa, other_muts_lig$asa)
# rsa
boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA")
wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa)
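## Less repetitive alternative (a sketch): loop the boxplot + Wilcoxon test over
## the PS columns compared above; placed here so it does not disturb the layout
## of the base plots drawn earlier in this scratch script
ps_cols = c("duet_scaled", "ddg", "rd_values", "kd_values", "asa", "rsa")
par(mfrow = c(2, 3))
for (col in ps_cols){
  boxplot(dr_muts[[col]], other_muts[[col]], main = col)
  print(wilcox.test(dr_muts[[col]], other_muts[[col]], paired = FALSE))
}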
# checking agreement b/w mcsm and foldx
cols_to_select = c("mutationinformation"
, "mutation_info"
, "duet_scaled"
, "ddg"
, "duet_outcome"
, "foldx_outcome")
merged_df3_short = select(merged_df3, cols_to_select)
mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),]
mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
table(mcsm_foldx$sign_comp)
# another way of checking
merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
table(merged_df3$sign_comp)
disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
agreement = 100 - disagreement
cat("There is", agreement, "% between mcsm and foldx predictions")