From f290d8ec9ec8c234a4b316e5bf5546d491658c52 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 25 Aug 2022 22:21:05 +0100 Subject: [PATCH] addded appendix tables and frequent muts --- scripts/.swp | Bin 0 -> 12288 bytes scripts/DE_CHECK_DEL | 1590 +++++++++++++++++ .../gid/gid_ORandSNP_results.R | 204 +++ .../plotting_thesis/gid/gid_appendix_tables.R | 426 +++++ .../plotting_thesis/gid/gid_ks_test_all_PS.R | 545 ++++++ .../plotting/plotting_thesis/ks_test_all_PS.R | 552 ++++++ .../rpob/rpob_appendix_tables.R | 482 +++++ scripts/rgb_hex.R | 33 + 8 files changed, 3832 insertions(+) create mode 100644 scripts/.swp create mode 100644 scripts/DE_CHECK_DEL create mode 100644 scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R create mode 100644 scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R create mode 100755 scripts/plotting/plotting_thesis/gid/gid_ks_test_all_PS.R create mode 100755 scripts/plotting/plotting_thesis/ks_test_all_PS.R create mode 100644 scripts/plotting/plotting_thesis/rpob/rpob_appendix_tables.R create mode 100644 scripts/rgb_hex.R diff --git a/scripts/.swp b/scripts/.swp new file mode 100644 index 0000000000000000000000000000000000000000..47bc07535235558148fed777bd05f03a2473f997 GIT binary patch literal 12288 zcmeI%PfEi;6bA6u?gXu(7l?Lap@d1&OeSkrf`1lL*QQoQkffBjcIzQLghz1U0bF_p z@eqDPW+J-i&KBVtcyD+wFPY)DE1jtG!{fk&v0!c4)U4jihSWuQ@9RbTtSE2GYS9Fr zQGf!g7g!r7air`$`g^;<&h~iqSKrP3Q&Lo6rcbFC_sTF3S3rq)qHw&9UPpVoQeEJDKfvA z?5j|XhAJH8NNF^1z1Wu8rY!Y6ouV`vDObK{?7MlBnqCwtHEffg%Z;6jZPGt_nSEfs Yd+l9x=I3;Rk;tB#xL*5t^wNAFUnAgNS^xk5 literal 0 HcmV?d00001 diff --git a/scripts/DE_CHECK_DEL b/scripts/DE_CHECK_DEL new file mode 100644 index 0000000..1af8c68 --- /dev/null +++ b/scripts/DE_CHECK_DEL @@ -0,0 +1,1590 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jun 28 14:15:40 2022 + +@author: tanu +""" +############################################################################### +homedir = os.path.expanduser("~") 
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/') + +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +from tidy_split import tidy_split +############################################################################### +#%% creating required variables: gene and drug dependent, and input filename +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +nssnp_match_captureG = "(" + nssnp_match + ")" + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n' + , resistance_col + , '\n===============================================================') +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#end of variable assignment for input and output files +############################################################################### +#%% Read input file +master_data = pd.read_csv(infile_master, sep = ',') + +# column names +#list(master_data.columns) + +# extract elevant columns to extract from 
meta data related to the drug +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , dr_muts_col + , other_muts_col]] +else: + core_cols = ['id' + , 'sample' + #, 'patient_id' + #, 'strain' + , 'lineage' + , 'sublineage' + , 'country_code' + #, 'geographic_source' + , resistance_col] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] + + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') + + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) + +# checks and results +total_samples = meta_data['id'].nunique() +print('RESULT: Total samples:', total_samples + , '\n===========================================================') + +# counts NA per column +meta_data.isna().sum() +print('No. 
of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================\n') +############################################################################### +#%% Quick checks: +meta_data['lineage'].value_counts() +meta_data['lineage'].value_counts().sum() +meta_data['lineage'].nunique() + +# replace lineage with 'L' in lineage_labels +#meta_data['lineage_labels'] = meta_data['lineage'] +#meta_data['lineage_labels'].equals(meta_data['lineage']) +#all(meta_data['lineage_labels'].value_counts() == meta_data['lineage'].value_counts()) +#meta_data['lineage_labels'] = meta_data['lineage_labels'].str.replace('lineage', 'L') +#meta_data['lineage'].value_counts() +#meta_data['lineage_labels'].value_counts() + +meta_data['lineage'] = meta_data['lineage'].str.replace('lineage', 'L') +meta_data['lineage'].value_counts() +print("\n================================" + , "\nLineage numbers" + , "\nComplete lineage samples:", meta_data['lineage'].value_counts().sum() + , "\nMissing lineage samples:", meta_data['id'].nunique() - meta_data['lineage'].value_counts().sum() + , "\n================================") + +meta_data['id'].nunique() +meta_data['sample'].nunique() +meta_data['id'].equals(meta_data['sample']) +foo = meta_data.copy() +foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0') +foo['Diff'].value_counts() + +meta_data['drtype'].value_counts() +meta_data['drtype'].value_counts().sum() +print("\n================================" + , "\ndrtype numbers" + , "\nComplete drtype samples:", meta_data['drtype'].value_counts().sum() + , "\nMissing drtype samples:", meta_data['id'].nunique() - meta_data['drtype'].value_counts().sum() + , "\n================================") + +meta_data['drug_name'] = meta_data[drug].map({1:'R' + , 0:'S'}) +meta_data['drug_name'].value_counts() +meta_data[drug].value_counts() +meta_data[drug].value_counts().sum() +print("\n================================" + , "\ndrug", drug, "numbers" + , 
"\nComplete drug samples:", meta_data[drug].value_counts().sum() + , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data[drug].value_counts().sum() + , "\n================================") + +print("\n================================" + , "\ndrug", drug, "numbers" + , "\nComplete drug samples:", meta_data['drug_name'].value_counts().sum() + , "\nMissing drug samples:", meta_data['id'].nunique() - meta_data['drug_name'].value_counts().sum() + , "\n================================") + +#%% Quick counts: All samples, drug: +print('===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, ':', meta_data[drug].value_counts().sum() + , '\n===========================================================\n' + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n===========================================================') + +print('===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, ':', meta_data['drug_name'].value_counts().sum() + , '\n===========================================================\n' + , 'RESULT: No. 
of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data['drug_name'].value_counts(normalize = True)*100 + , '\n===========================================================') + +############################################################################## +#%% Extracting nsSNP for gene (meta_gene_all): from dr_muts col and other_muts_col +#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] +# so downstream code doesn't change +meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] +#%% DF: with dr_muts_col, meta_gene_dr +meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] + +if meta_gene_dr.equals(meta_gene_dr1): + print('\nPASS: DF with column:', dr_muts_col, 'extracted successfully' + , '\ngene_snp_match in column:',dr_muts_col, meta_gene_dr.shape) +else: + sys.exit('\nFAIL: DF with column:', dr_muts_col,'could not be extracted' + , '\nshape of df1:', meta_gene_dr.shape + , '\nshape of df2:', meta_gene_dr1.shape + , '\nCheck again!') +############################################################################## +#%% DF: with other_muts_col, other_gene_dr +meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, 
regex = True, case = False)] + +print('gene_snp_match in dr:', len(meta_gene_other)) +meta_gene_other.equals(meta_gene_other1) + +if meta_gene_other.equals(meta_gene_other1): + print('\nPASS: DF with column:', other_muts_col,'extracted successfully' + , '\ngene_snp_match in column:',other_muts_col, meta_gene_other.shape) +else: + sys.exit('\nFAIL: DF with column:', other_muts_col,'could not be extracted' + , '\nshape of df1:', meta_gene_other.shape + , '\nshape of df2:', meta_gene_other1.shape + , '\nCheck again!') +############################################################################## +#%% Quick counts: nsSNP samples, drug +meta_gene_all[drug].value_counts() + +print('===========================================================\n' + , 'RESULT: Total no. of samples for', drug, 'with nsSNP mutations:', meta_gene_all['id'].nunique() + , '\n===========================================================\n' + + , '===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all[drug].value_counts().sum() + , '\n===========================================================\n' + + , '===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, 'with nsSNP mutations:', meta_gene_all['drug_name'].value_counts().sum() + + , '\n===========================================================\n' + , 'RESULT: No. 
of Sus and Res', drug, 'samples with nsSNP:\n', meta_gene_all['drug_name'].value_counts() + + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples with nsSNP mutations:\n', meta_gene_all['drug_name'].value_counts(normalize = True)*100 + , '\n===========================================================') +############################################################################### +#%% Create a copy of indices for downstream mergeing +#meta_gene_all['index_orig'] = meta_gene_all.index +#meta_gene_all['index_orig_copy'] = meta_gene_all.index + +#all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) +#all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) +############################################################################## +#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. +# Split based on semi colon dr_muts_col +search = ";" + +# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_dr = meta_gene_dr[['id', dr_muts_col]].copy() +count_df_dr['dr_semicolon_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(search, re.I) +dr_sc_C = count_df_dr['dr_semicolon_count'].value_counts().reset_index() +dr_sc_C + +dr_sc_C['index_semicolon'] = (dr_sc_C['index'] + 1) *dr_sc_C['dr_semicolon_count'] +dr_sc_C +expected_drC = dr_sc_C['index_semicolon'].sum() +expected_drC + +# count no. 
of nsSNPs and extract those nsSNPs +count_df_dr['dr_geneSNP_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(nssnp_match, re.I) +dr_gene_count1 = count_df_dr['dr_geneSNP_count'].sum() + +############################################################################### +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. +# This will have: unique COMBINATION of sample id and mutations +#======== +# First: counting mutations in dr_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[dr_muts_col]) + +# sanity check: count na +na_count = meta_data[dr_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples + , '\n==========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +dr_gene_count = 0 +wt = 0 +id_dr = [] +id2_dr = [] +dr_gene_mutsL = [] +#nssnp_match_regex = re.compile(nssnp_match) + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_dr.append(id) + #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE)) + gene_drL = re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE) + #print(count_gene_dr) + if count_gene_dr > 0: + id_dr.append(id) + if count_gene_dr > 1: + id2_dr.append(id) + #print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + dr_gene_mutsL = dr_gene_mutsL + gene_drL + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') + wt = wt + count_wt + +print('RESULTS:') +print('Total WT in 
dr_muts_col:', wt) +print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) ) +print('Total matches of UNIQUE', gene, 'SNP matches in', dr_muts_col, ':', len(set(dr_gene_mutsL))) +print('=================================================================') + +if dr_gene_count == dr_gene_count1: + print('\nPass: dr gene SNP count match') +else: + sys.exit('\nFAIL: dr gene SNP count MISmatch') + +del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) + +#%% tidy_split(): dr_muts_col +#========= +# DF1: dr_muts_col +# and remove leading white spaces +#========= +#col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():' , dr_muts_col + , '\n============================================================') + +# apply tidy_split() +dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';') +# remove leading white space else these are counted as distinct mutations as well +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() + +if len(dr_WF0) == expected_drC: + print('\nPass: tidy split on', dr_muts_col, 'worked' + , '\nExpected nrows:', expected_drC + , '\nGot nrows:', len(dr_WF0) ) +else: + print('\nFAIL: tidy split on', dr_muts_col, 'did not work' + , '\nExpected nrows:', expected_drC + , '\nGot nrows:', len(dr_WF0) ) + +# Extract only the samples/rows with nssnp_match +#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)] +#dr_gene_WF0_v2 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match_captureG, regex = True, case = False)] + +print('Lengths after tidy split and extracting', nssnp_match, 'muts:' + , '\nOld length:' , len(meta_gene_dr) + , '\nLength after split:', len(dr_WF0) + , '\nLength of nssnp df:', len(dr_gene_WF0) + , '\nExpected len:', 
dr_gene_count + , '\n=============================================================') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) +print('Dim of dr_df:', dr_df.shape + , '\n==============================================================' + , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' + , '\n===============================================================') + +if other_muts_col in dr_df.columns: + print('Dropping:', other_muts_col, 'from WF gene dr_df') + dr_df = dr_df.drop([other_muts_col], axis = 1) +######################################################################## +#%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc. +# Split based on semi colon on other_muts_col +# count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_other = meta_gene_other[['id', other_muts_col]].copy() +count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I) +other_sc_C = count_df_other['other_semicolon_count'].value_counts().reset_index() +other_sc_C +other_sc_C['index_semicolon'] = (other_sc_C['index']+1)*other_sc_C['other_semicolon_count'] +other_sc_C +expected_otherC = other_sc_C['index_semicolon'].sum() +expected_otherC + +# count no. of nsSNPs and extract those nsSNPs +count_df_other['other_geneSNP_count'] = meta_gene_other.loc[:, other_muts_col].str.count(nssnp_match, re.I) +other_gene_count1 = count_df_other['other_geneSNP_count'].sum() + +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. 
+# This will have: unique COMBINATION of sample id and mutations + +#======== +# Second: counting mutations in other_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[other_muts_col]) + +# sanity check: count na +na_count = meta_data[other_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs =', na_count, '/', total_samples + , '\n=========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +other_gene_count = 0 +wt_other = 0 +id_other = [] +id2_other = [] +other_gene_mutsL = [] + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_other.append(id) + #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE)) + gene_otherL = re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE) + #print(count_gene_other) + if count_gene_other > 0: + id_other.append(id) + if count_gene_other > 1: + id2_other.append(id) + #print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + other_gene_mutsL = other_gene_mutsL + gene_otherL + count_wt = clean_df[other_muts_col].iloc[i].count('WT') + wt_other = wt_other + count_wt + +print('RESULTS:') +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) ) +print('Total matches of UNIQUE', gene, 'SNP matches in', other_muts_col, ':', len(set(other_gene_mutsL))) +print('=================================================================') + +if other_gene_count == other_gene_count1: + print('\nPass: other gene SNP count 
match') +else: + sys.exit('\nFAIL: other gene SNP count MISmatch') + +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) +#%% tidy_split(): other_muts_col +#========= +# DF2: other_muts_col +# and remove leading white spaces +#========= +#col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', other_muts_col + , '\n============================================================') + +# apply tidy_split() +other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';') +# remove the leading white spaces in the column +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() + +# extract only the samples/rows with nssnp_match +#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)] + +print('Lengths after tidy split and extracting', gene_match, 'muts:', + '\nOld length:' , len(meta_gene_other), + '\nLength after split:', len(other_WF1), + '\nLength of nssnp df:', len(other_gene_WF1), + '\nExpected len:', other_gene_count + , '\n=============================================================') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df.shape + , '\n===============================================================' + , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' + , '\n===============================================================') + +if dr_muts_col in other_df.columns: + print('Dropping:', dr_muts_col, 'from WF gene other_df') + other_df = other_df.drop([dr_muts_col], axis = 1) 
+############################################################################### +#%% Finding ambiguous muts +common_snps_dr_other = set(dr_gene_mutsL).intersection(set(other_gene_mutsL)) + +#%% More sanity checks: expected unique snps and nrows in LF data +expected_unique_snps = len(set(dr_gene_mutsL)) + len(set(other_gene_mutsL)) - len(common_snps_dr_other) +expected_rows = dr_gene_count + other_gene_count + +#%% Useful results to note==> counting dr, other and common muts +print('\n===================================================================' + , '\nCount unique nsSNPs for', gene, ':' + , expected_unique_snps + , '\n===================================================================') + +Vcounts_dr = pd.Series(dr_gene_mutsL).value_counts() +Vcounts_common_dr = Vcounts_dr.get(list(common_snps_dr_other)) +print('\n===================================================================' + , "\nCount of samples for common muts in dr muts\n" + , Vcounts_common_dr + , '\n===================================================================') + +Vcounts_other = pd.Series(other_gene_mutsL).value_counts() +Vcounts_common_other = Vcounts_other.get(list(common_snps_dr_other)) + +print('\n===================================================================' + , '\nCount of samples for common muts in other muts\n' + , Vcounts_common_other + , '\n===================================================================') + +print('\n===================================================================' + , '\nPredicting total no. 
of rows in the curated df:', expected_rows + , '\n===================================================================') + +#%% another way: Add value checks for dict so you can know if its correct for LF data count below +dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict() +other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict() + +for k, v in dr_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +for k, v in other_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +# using defaultdict +Cdict = collections.defaultdict(int) + +# iterating key, val with chain() +for key, val in itertools.chain(dr_snps_vc_dict.items(), other_snps_vc_dict.items()): + if key in common_snps_dr_other: + Cdict[key] += val + else: + Cdict[key] = val + +print(dict(Cdict)) + +for k, v in Cdict.items(): + if k in common_snps_dr_other: + print(k,v) + +# convert defaultDict to dict +SnpFDict_orig = dict(Cdict) + +def lower_dict(d): + new_dict = dict((k.lower(), v) for k, v in d.items()) + return new_dict + +SnpFDict = lower_dict(SnpFDict_orig) +############################################################################### +# USE Vcounts to get expected curated df +# resolve dm om muts funda +#%% Concatenating two dfs: gene_LF0 +# equivalent of rbind in R +#========== +# Concatentating the two dfs: equivalent of rbind in R +#========== +# Important: Change column names to allow concat: +# dr_muts.. 
& other_muts : 'mutation' +print('Now concatenating the two dfs by row' + , '\nFirst assigning a common colname: "mutation" to the col containing muts' + , '\nThis is done for both dfs' + , '\n===================================================================') + +dr_df.columns +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) +dr_df.columns + +other_df.columns +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) +other_df.columns + +if len(dr_df.columns) == len(other_df.columns): + print('Checking dfs for concatening by rows:' + , '\nDim of dr_df:', dr_df.shape + , '\nDim of other_df:', other_df.shape + , '\nExpected nrows:', len(dr_df) + len(other_df) + , '\n=============================================================') +else: + sys.exit('FAIL: No. of cols mismatch for concatenating') + +# checking colnames before concat +print('Checking colnames BEFORE concatenating the two dfs...') +if (set(dr_df.columns) == set(other_df.columns)): + print('PASS: column names match necessary for merging two dfs') +else: + sys.exit('FAIL: Colnames mismatch for concatenating!') + +# concatenate (axis = 0): Rbind, adn keeo original index +print('Now appending the two dfs: Rbind') +gene_LF_comb = pd.concat([dr_df, other_df] + #, ignore_index = True + , axis = 0) + +if gene_LF_comb.index.nunique() == len(meta_gene_all.index): + print('\nPASS:length of index in LF data:', len(gene_LF_comb.index) + , '\nLength of unique index in LF data:', gene_LF_comb.index.nunique() + ) +else: + sys.exit('\nFAIL: expected length for combined LF data mismatch:' + , '\nExpected length:', len(meta_gene_all.index) + , '\nGot:', gene_LF_comb.index.nunique() ) + +print('Finding stop mutations in concatenated df') +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +if stop_muts == 0: + print('PASS: No stop mutations detected') +else: + print('stop mutations detected' + , '\nNo. 
of stop muts:', stop_muts, '\n' + , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + , '\nNow removing them') + +gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] +print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) + +gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] +print('snps only by direct regex:', len(gene_LF0)) + +if gene_LF0_nssnp.equals(gene_LF0): + print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed' + , '\nUsing the regex extracted df') +else: + sys.exit('FAIL: posssibly regex or no of stop mutations' + , 'Regex being used:', nssnp_match) + #sys.exit() + +# checking colnames and length after concat +print('Checking colnames AFTER concatenating the two dfs...') +if (set(dr_df.columns) == set(gene_LF0.columns)): + print('PASS: column names match' + , '\n=============================================================') +else: + sys.exit('FAIL: Colnames mismatch AFTER concatenating') + +print('Checking concatenated df') +if len(gene_LF0) == (len(dr_df) + len(other_df))- stop_muts: + print('PASS:length of df after concat match' + , '\n===============================================================') +else: + sys.exit('FAIL: length mismatch') + +#%% NEW check2:id, sample, etc. +gene_LF0['id'].count() +gene_LF0['id'].nunique() +gene_LF0['sample'].nunique() +gene_LF0['mutation_info'].value_counts() +gene_LF0[drug].isna().sum() +#%% Concatenating two dfs sanity checks: gene_LF1 +#========================================================= +# This is hopefully clean data, but just double check +# Filter LF data so that you only have +# mutations corresponding to nssnp_match (case insensitive) +# this will be your list you run OR calcs +#========================================================= +print('Length of gene_LF0:', len(gene_LF0) + , '\nThis should be what we need. 
But just double checking and extracting nsSNP for', gene + , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) + +#gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)].copy() + +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', + '\nConfirming extraction and concatenating worked correctly' + , '\n==========================================================') +else: + print('FAIL: BUT NOT FATAL!' + , '\ngene_LF0 and gene_LF1 lengths differ' + , '\nsuggesting error in extraction process' + , ' use gene_LF1 for downstreama analysis' + , '\n=========================================================') + +print('Length of dfs pre and post processing...' + , '\ngene WF data (unique samples in each row):',len(meta_gene_all) + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) + , '\n=============================================================') + +# sanity check for extraction +# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' +print('Verifying whether extraction process worked correctly...') +if len(gene_LF1) == expected_rows: + print('PASS: extraction process performed correctly' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) + , '\n=========================================================') +else: + print('FAIL: extraction process has bugs' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nDebug please' + , '\n=========================================================') + +# more sanity checks +print('Performing some more sanity checks...') + +# From LF1: useful for OR counts +# no. 
of unique muts +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) + +# no. of samples contributing the unique muts +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique()) + +# no. of dr and other muts +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique()) + +# sanity check +if len(distinct_muts) == mut_grouped.sum() : + print('PASS:length of LF1 is as expected' + , '\n===============================================================') +else: + print('FAIL: Mistmatch in count of muts' + , '\nExpected count:', len(distinct_muts) + , '\nActual count:', mut_grouped.sum() + , '\nMuts should be distinct within dr* and other* type' + , '\nInspecting...possibly ambiguous muts' + , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts) + , '\n=========================================================') + +muts_split = list(gene_LF1.groupby('mutation_info')) +dr_muts = muts_split[0][1].mutation +other_muts = muts_split[1][1].mutation +print('splitting muts by mut_info:', muts_split) +print('no.of dr_muts samples:', len(dr_muts)) +print('no. of other_muts samples', len(other_muts)) +#%% Ambiguous muts: the same mutation cannot be classed as a drug AND 'others' +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' + , '\n===============================================================') +else: + print('PASS: NO ambiguous muts detected' + , '\nMuts are unique to dr_ and other_ mutation class' + , '\n=========================================================') + +# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected! +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('Finding ambiguous muts...' 
+ , '\n=========================================================' + , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() + , '\nThese are:', dr_muts[dr_muts.isin(other_muts)] + , '\n=========================================================' + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] + , '\n=========================================================') + + print('Counting no. of ambiguous muts...' + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + + if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): + common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') +else: + #sys.exit('Error: ambiguous muts present, but extraction failed. 
Debug!') + print('Ambiguous muts are NOT present') + +#%% DOES NOT depend on common_muts +gene_LF1_orig = gene_LF1.copy() +gene_LF1_orig.equals(gene_LF1) + +# copy the old columns for checking +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info'].value_counts() + +#%% Ambiguous muts: revised annotation for mutation_info +if 'common_muts' in globals(): + ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() + ambiguous_muts_value_counts + + # gene_LF1_orig = gene_LF1.copy() + # gene_LF1_orig.equals(gene_LF1) + + # # copy the old columns for checking + # gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] + # gene_LF1['mutation_info'].value_counts() + #%% Inspect ambiguous muts + #===================================== + # Now create a new df that will have: + # ambiguous muts + # mutation_info + # revised mutation_info + # The revised label is based on value_counts + # for mutaiton_info. 
The corresponding mutation_info: + # label is chosen that corresponds to the max of value counts + #===================================== + ambig_muts_rev_df = pd.DataFrame() + changes_val = [] + changes_dict = {} + + common_muts + gene_LF1['mutation'].head() + #common_muts_lower = list((map(lambda x: x.lower(), common_muts))) + #common_muts_lower + + for i in common_muts: + #for i in common_muts_lower: + #print(i) + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] + temp_df + # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() + max_info_table + revised_label = max_info_table[[0]].index[0][1] # max value_count + old_label = max_info_table[[1]].index[0][1] # min value_count + print('\nAmbiguous mutation labels...' + , '\nSetting mutation_info for', i, 'to', revised_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) + , revised_label + , temp_df['mutation_info_orig']) + ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + f = max_info_table[[1]] + old_label_count = f[[0]][0] + changes_val.append(old_label_count) + cc_dict = f.to_dict() + changes_dict.update(cc_dict) + + ambig_muts_rev_df['mutation_info_REV'].value_counts() + ambig_muts_rev_df['mutation_info_orig'].value_counts() + changes_val + changes_total = sum(changes_val) + changes_dict + n_changes = sum(changes_dict.values()) + #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts + #================== + # ambiguous muts + #================== + #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) + #other_muts.XXXX_csvXXX('other_muts.csv', header = True) + if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + 
print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = True) + + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) + + #%% OUTFILE 2, write file: ambiguous mut counts + #====================== + # ambiguous mut counts + #====================== + out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' + outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts + print('\n----------------------------------' + , '\nWriting file: ambiguous mut counts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_mut_counts) + + ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) + #%% FIXME: Add sanity check to make sure you can add value_count checks + #%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations + #================= + # Merge ambig muts + # with gene_LF1 + #=================== + ambig_muts_rev_df.index + gene_LF1.index + all(ambig_muts_rev_df.index.isin(gene_LF1.index)) + + any(gene_LF1.index.isin(ambig_muts_rev_df.index)) + # if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)): + if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == ambig_muts_rev_df.index.nunique()): + + print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') + else: + sys.exit('\nFAIL:ambiguous mut indices MISmatch. 
Check section Resolving ambiguous muts') + + ########################################################################## + for index, row in ambig_muts_rev_df.iterrows(): + curr_mut = row['mutation'] + curr_rev = row['mutation_info_REV'] + print('\n=====\nAmbiguous Mutation: index:', index, '\nmutation:', curr_mut, '\nNew:', curr_rev, '\n=====\n' ) + print('\n-----\nReplacing original: index:', index, '\nmutation: ' + , gene_LF1.loc[index,'mutation'] + , '\nmutation_info to replace:' + , gene_LF1.loc[index,'mutation_info'] + , '\nwith:', curr_rev, '\n-----') + replacement_row=(gene_LF1.index==index) & (gene_LF1['mutation'] == curr_mut) + gene_LF1.loc[replacement_row, 'mutation_info'] = curr_rev + + ########################################################################### + + gene_LF1['mutation_info_orig'].value_counts() + gene_LF1['mutation_info_v1'].value_counts() + gene_LF1['mutation_info'].value_counts() + + # Sanity check1: if there are still any ambiguous muts + #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) + muts_split_rev = list(gene_LF1.groupby('mutation_info')) + dr_muts_rev = muts_split_rev[0][1].mutation + other_muts_rev = muts_split_rev[1][1].mutation + print('splitting muts by mut_info:', muts_split_rev) + print('no.of dr_muts samples:', len(dr_muts_rev)) + print('no. of other_muts samples', len(other_muts_rev)) + + if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: + print('\nAmbiguous muts corrected. Proceeding with downstream analysis') + else: + print('\nAmbiguous muts NOT corrected. Quitting!') + sys.exit() +else: + print('Mutations ARE NOT ambiguous, proceeding to downstream analyses') + +#gene_LF1['mutation_info_v1'].value_counts() +gene_LF1['mutation_info'].value_counts() + +# reassign +#%% PHEW! 
Good to go for downstream stuff +#%% Add column: Mutationinformation ==> gene_LF1 +# splitting mutation column to get mCSM style muts +#===================================================== +# Formatting df: read aa dict and pull relevant info +#===================================================== +print('Now some more formatting:' + , '\nRead aa dict and pull relevant info' + , '\nFormat mutations:' + , '\nsplit mutation into mCSM style muts: ' + , '\nFormatting mutation in mCSM style format: {WT}{MUT}' + , '\nAssign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + +#====================================================================== +# Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one using +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for +# compatibility with dict +#======================================================================= +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() + +print('wt regex being used:', wt_regex + , '\nmut regex being used:', mut_regex + , '\nposition regex being used:', pos_regex) + +mylen0 = len(gene_LF1.columns) + +#========================================================= +# Iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. 
+# The three letter code is extracted using a string match match from the +# dataframe and then converted +# to 'pandas series'since map only works in pandas series +#=========================================================== +print('Adding', ncol_mutf_add, 'more cols:\n') + +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wild_type'] = wt.map(lookup_dict) + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mutant_type'] = mut.map(lookup_dict) + +#------------------- +# extract position +# info from mutation +# column separetly using string match +#------------------- +#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) + +#mylen1 = len(gene_LF1.columns) +mylen0_v2 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt & mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +#if mylen1 == mylen0 + ncol_mutf_add: +if mylen0_v2 == mylen0 + ncol_mutf_add: + print('PASS: successfully added', ncol_mutf_add, 'cols' + , '\nold length:', mylen0 + , '\nnew len:', mylen0_v2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen0 + , '\nnew len:', mylen0_v2) + +# 
clear variables +del(k, v, wt, mut, lookup_dict) + +########################################################################## +# combine the wild_type+poistion+mutant_type columns to generate +# mutationinformation (matches mCSM output field) +# Remember to use .map(str) for int col types to allow string concatenation +########################################################################### +gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] +print('Created column: mutationinformation' + , '\n=====================================================================\n' + , gene_LF1.mutationinformation.head(10)) + +# order by position for convenience +gene_LF1.dtypes + +# converting position to numeric +gene_LF1['position'] = pd.to_numeric(gene_LF1['position']) + +# sort by position inplace +foo = gene_LF1['position'].value_counts() +foo = foo.sort_index() + +gene_LF1.sort_values(by = ['position'], inplace = True) +bar = gene_LF1['position'].value_counts() +bar = bar.sort_index() + +if all(foo == bar): + print('PASS: df ordered by position') + print(gene_LF1['position'].head()) +else: + sys.exit('FAIL: df could not be ordered. 
Check source') + +print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n') + +#%% Create a copy of mutationinformation column for downstream mergeing +gene_LF1['Mut'] = gene_LF1['mutationinformation'] +gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] + +#%% Create a copy of indices for downstream mergeing +gene_LF1['index_orig'] = gene_LF1.index +gene_LF1['index_orig_copy'] = gene_LF1.index + +all(gene_LF1.index.values == gene_LF1['index_orig'].values) +all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) + +#%% Quick sanity check for position freq count +# count the freq of 'other muts' samples +test_df = gene_LF1.copy() +test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] +# add freq column +#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') +#print('Revised dim of other_muts_df:',test_df.shape) +test_df['scount'] = test_df['mutation'].map(SnpFDict) +test_df = test_df.sort_values(['position', 'mutationinformation']) +#%% Map mutation frequency count as a column +gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) +#%% Map position frequency count as a column +z = gene_LF1['position'].value_counts() +z1 = z.to_dict() +gene_LF1['pos_count'] = gene_LF1['position'].map(z1) + +#test_df2 = test_df.loc[test_df['position'] == 10] +############################################################################### +cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency'] +print('\nAdded', len(cols_added), 'more cols:\n' + , '\nDim of new gene_LF1:', len(gene_LF1.columns)) +mylen1 = len(gene_LF1.columns) # updated my_len1 +############################################################################### +#%% Add column: aa property_water ==> gene_LF1 +#========= +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. 
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# Build the full lookup dict {three_letter_code: aa_prop_water} first, then
# run the column-wide extract + map ONCE.
# (Originally the extract/map lines sat INSIDE this loop, redoing the whole
# column's work for every entry of my_aa_dict; only the final iteration's
# result survived, so hoisting them out is behaviour-neutral and faster.)
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_water']

# .squeeze() turns the one-column extract result into a Series so .map works
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)

mylen2 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
# BUG FIX: the original condition 'wt.isna().sum() & mut.isna().sum() == 0'
# parses as '(wt_NAs & mut_NAs) == 0' because bitwise & binds tighter than
# ==, so e.g. 1 NA in wt and 2 NAs in mut gave 1 & 2 == 0 -> spurious PASS.
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    # BUG FIX: this branch previously printed 'No NAs detected' too
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

if mylen2 == mylen1 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen1
          , '\nnew len:', mylen2)

# clear variables
del(k, v, wt, mut, lookup_dict)

#%% Add column: aa_prop_polarity ==> gene_LF1
#========
# iterate through the dict, create a lookup dict i.e.
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')

# Build the full lookup dict {three_letter_code: aa_prop_polarity} first,
# then run the column-wide extract + map ONCE.
# (Originally the extract/map lines sat INSIDE this loop, redoing the whole
# column's work for every entry of my_aa_dict; only the final iteration's
# result survived, so hoisting them out is behaviour-neutral and faster.)
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['aa_prop_polarity']

# .squeeze() turns the one-column extract result into a Series so .map works
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)

mylen3 = len(gene_LF1.columns)

# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
# BUG FIX: the original condition 'wt.isna().sum() & mut.isna().sum() == 0'
# parses as '(wt_NAs & mut_NAs) == 0' because bitwise & binds tighter than
# ==, so e.g. 1 NA in wt and 2 NAs in mut gave 1 & 2 == 0 -> spurious PASS.
if wt.isna().sum() == 0 and mut.isna().sum() == 0:
    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
          , '\nNo NAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
else:
    # BUG FIX: this branch previously printed 'No NAs detected' too
    print('FAIL: 3-letter wt&mut residue extraction failed'
          , '\nNAs detected:'
          , '\nwild-type\n', wt
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)

# BUG FIX: the PASS/FAIL messages below previously printed mylen1/mylen2
# (the water-property step's counts) instead of this step's mylen2/mylen3.
if mylen3 == mylen2 + ncol_aa_add:
    print('PASS: successfully added', ncol_aa_add, 'cols'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)
else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen2
          , '\nnew len:', mylen3)

# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop ==> gene_LF1
#========
# iterate through the dict, create a lookup dict i.e.
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
+#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wt_calcprop'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mut_calcprop'] = mut.map(lookup_dict) + +mylen4 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen4 == mylen3 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#%% NEW mappings: gene_LF2 +# gene_LF2: copy gene_LF1 +gene_LF2 = gene_LF1.copy() +gene_LF2.index + +#%% Add total unique id count +gene_LF2['id'].nunique() +gene_LF2['Mut'].nunique() +total_id_ucount = gene_LF2['id'].nunique() +total_id_ucount +total_id_ucount2 = gene_LF2['sample'].nunique() +total_id_ucount2 + +if total_id_ucount == total_id_ucount2: + print('\nPASS: sample and id unique counts match') +else: + print('\nFAIL: sample and id unique counts DO NOT match!' 
+ , '\nGWAS worry!?') + +# Add all sample ids in a list for sanity checks +#gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list)) +#========================================== +# Add column: total unique id/sample count +#========================================== +gene_LF2['total_id_ucount'] = total_id_ucount + +#========================================== +# DELETE as already mapped: Add column: mutation count in all samples +#========================================== +# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) +# gene_LF2['mut_id_ucount'] + +gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) + +# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) + +#%% AF for gene +#================= +# Add column: AF +#================= +gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] +gene_LF2['maf'].head() +############################################################################### +#%% Further mappings: gene_LF3, with mutationinformation as INDEX +gene_LF3 = gene_LF2.copy() + +# Assign index: mutationinformation for mapping +gene_LF3 = gene_LF3.set_index(['mutationinformation']) +gene_LF3.index +gene_LF3['id'].nunique() +gene_LF3['Mut'].nunique() +gene_LF3.index.nunique() + +all(gene_LF3['Mut'] == gene_LF3.index) + +#%% Mapping 1: column '' +#============================ +# column name: +#============================ +gene_LF3['drtype'].value_counts() + +# mapping 2.1: numeric +drtype_map = {'XDR': 5 + , 'Pre-XDR': 4 + , 'MDR': 3 + , 'Pre-MDR': 2 + , 'Other': 1 + , 'Sensitive': 0} + +gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map) + +gene_LF3['drtype'].value_counts() +gene_LF3['drtype_numeric'].value_counts() + +# Multimode: drtype +#============================= +# Recalculation: Revised drtype +# max(multimode) 
+#============================= +#-------------------------------- +# drtype: ALL values: +# numeric and names in an array +#-------------------------------- +gene_LF3['drtype_all_vals'] = gene_LF3['drtype_numeric'] +gene_LF3['drtype_all_names'] = gene_LF3['drtype'] + +gene_LF3['drtype_all_vals'] = gene_LF3.groupby('mutationinformation').drtype_all_vals.apply(list) +gene_LF3['drtype_all_vals'].head() + +gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) +gene_LF3['drtype_all_names'].head() + +#--------------------------------- +# Revised drtype: max(Multimode) +#-------------------------------- +gene_LF3['drtype_multimode'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].agg(multimode) +gene_LF3['drtype_multimode'] + +# Now get the max from multimode +gene_LF3['drtype_mode'] = gene_LF3['drtype_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3.head() + +#---------------------- +# Revised drtype: Max +#---------------------- +gene_LF3.head() +gene_LF3['drtype_max'] = gene_LF3.groupby('mutationinformation')['drtype_numeric'].max() +gene_LF3.head() + +foo = gene_LF3[['Mut', 'position', 'drtype', 'drtype_multimode', 'drtype_mode', 'drtype_max']] +foo2 = foo.sort_values(['position', 'Mut']) +############################################################################### +#%% Mapping 2: column '', drug + +#======================= +# column name: +#======================= +# mapping 1.1: labels +dm_om_label_map = {dr_muts_col: 'DM' + , other_muts_col: 'OM'} +dm_om_label_map + +gene_LF3['mutation_info_labels'] = gene_LF3['mutation_info'].map(dm_om_label_map) + +# mapping 1.2: numeric +dm_om_num_map = {dr_muts_col: 1 + , other_muts_col: 0} + +gene_LF3['dm_om_numeric'] = gene_LF3['mutation_info'].map(dm_om_num_map) +gene_LF3['dm_om_numeric_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_num_map) + +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() 
+gene_LF3['mutation_info_orig'].value_counts() +gene_LF3['dm_om_numeric'].value_counts() +gene_LF3['dm_om_numeric_orig'].value_counts() + +# Check value_counts: column '', mutation_info +gene_LF3['mutation_info'].value_counts() +gene_LF3['mutation_info_v1'].value_counts() +gene_LF3['mutation_info_orig'].value_counts() + +#============================ +# column name: +#============================ +# copy dst column +gene_LF3['dst'] = gene_LF3[drug] # to allow cross checking +gene_LF3['dst'].equals(gene_LF3[drug]) + +# populate dst_multimode +gene_LF3['dst_multimode'] = gene_LF3[drug] + +gene_LF3[drug].isnull().sum() +gene_LF3['dst_multimode'].isnull().sum() + +gene_LF3['dst_multimode'].value_counts() +gene_LF3['dst_multimode'].value_counts().sum() +#%% Multimode: dst +# For each mutation, generate the revised dst which is the mode of dm_om_numeric +#============================= +# Recalculation: Revised dst +# max(multimode) +#============================= +# Get multimode for dm_om_numeric column +#dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +dm_om_multimode_LF3 = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +dm_om_multimode_LF3 +dm_om_multimode_LF3.isnull().sum() + +gene_LF3['dst_multimode_all'] = gene_LF3.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +gene_LF3['dst_multimode_all'] + +# Fill using multimode ONLY where NA in dst_multimode column +gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) +gene_LF3['dst_multimode'] +gene_LF3['dst_multimode'].value_counts() + +#---------------------------------- +# Revised dst column: max of mode +#---------------------------------- +# Finally created a revised dst with the max from the multimode +# Now get the max from multimode +#gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right! 
+#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) +gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) #ML + +#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- +# NOTE: unexpected weirdness with above, so redoing it! +mmdf = pd.DataFrame(gene_LF3.groupby('mutationinformation')['dst_mode'].agg(multimode)) +mmdf['dst2'] = mmdf['dst_mode'].apply(lambda x: int(max(x))) +mmdf=mmdf.reset_index() + +# rename cols to make sure merge will have the names you expect +mmdf2 = mmdf.rename(columns = {'dst_mode':'dst_multimode', 'dst2':'dst_mode'}) + +# IMPORTANT! +gene_LF3_copy = gene_LF3.copy() +gene_LF3_copy.drop(["dst_mode", "dst_multimode", "dst_multimode_all"], axis = 1, inplace = True) + +# Now merge gene_LF3.copy and mmdf2 +gene_LF3_merged = pd.merge(gene_LF3_copy, mmdf2, on='mutationinformation') +df_check4 = gene_LF3_merged[['mutationinformation', 'dst', 'dst_multimode', 'dst_mode', 'position' ]] + +# now reassign the merged df to gene_LF3 for integration with downstream +gene_LF3 = gene_LF3_merged.copy() +#----------------------------------------------------------------------------- +#----------------------------------------------------------------------------- + +# sanity checks +#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode']) +gene_LF3[drug].value_counts() +#gene_LF3['dst_noNA'].value_counts() +gene_LF3['dst_mode'].value_counts() + +if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]: + print('\nPASS: revised dst mode created successfully and the counts are more than col count') +else: + print('\nFAIL: revised dst mode numbers MISmatch') + +#foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] +#foo2 = 
foo.sort_values(['position', 'Mut']) + +print('\n------------------------------------------------------' + , '\nRevised counting: mutation_info i.e dm om column\n' + , '\n-----------------------------------------------------\n' + + , '\n----------------------------------' + , '\nOriginal drug column count' + , '\n----------------------------------\n' + , gene_LF3[drug].value_counts() + , '\nTotal samples [original]', gene_LF3[drug].value_counts().sum() + + , '\n----------------------------------' + , '\nRevised drug column count\n' + , '\n----------------------------------\n' + , gene_LF3['dst_mode'].value_counts() + , '\nTotal samples [revised]', gene_LF3['dst_mode'].value_counts().sum() + + # , '\n----------------------------------' + # , '\nRevised drug column count: dst_noNA\n' + # , '\n----------------------------------\n' + # , gene_LF3['dst_noNA'].value_counts() + ) +#%% Create revised mutation_info_column based on dst_mode +#--------------------------------------- +# Create revised mutation_info_column +#--------------------------------------- +# Will need to overwrite column 'mutation_info_labels', since downstream depends on it + +# Make a copy you before overwriting +#gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_v1'] = gene_LF3['mutation_info_labels'] +gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_labels'].value_counts() + +# Now overwrite +gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) +gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) + +if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): + print('\nPASS: Revised mutation_info column created successfully') + gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) +else: + print('\nmutation info labels numbers mismatch' + , '\nPlease check section for mapping dst_mode 
to labels') + +gene_LF3['mutation_info_orig'].value_counts() +#gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_orig'] = gene_LF3['mutation_info_orig'].map(dm_om_label_map) +gene_LF3['mutation_info_labels_orig'].value_counts() + +#%% FIXME: Get multimode for dm_om_numeric column +#dm_om_multimode_LF4 = gene_LF3.groupby('mutationinformation')['dm_om_numeric_orig'].agg(multimode) +#dm_om_multimode_LF4 +#gene_LF3['dst_multimode_numeric'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF4) + +# %% sanity check for the revised dst +gene_LF3[drug].value_counts() +gene_LF3[drug].value_counts().sum() + +gene_LF3['mutation_info_labels_orig'].value_counts() + +gene_LF3['dst_mode'].value_counts() +gene_LF3['dst_mode'].value_counts().sum() + +# direct comparision +gene_LF3['dst'].value_counts() +gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_v1'].value_counts() + +#%% Lineage +gene_LF3['lineage'].value_counts() +# lineage_label_numeric = {'lineage1' : 1 +# , 'lineage2' : 2 +# , 'lineage3' : 3 +# , 'lineage4' : 4 +# , 'lineage5' : 5 +# , 'lineage6' : 6 +# , 'lineage7' : 7 +# , 'lineageBOV' : 8} + +lineage_label_numeric = {'L1' : 1 + , 'L2' : 2 + , 'L3' : 3 + , 'L4' : 4 + , 'L5' : 5 + , 'L6' : 6 + , 'L7' : 7 + , 'LBOV' : 8} + +lineage_label_numeric +#%% Lineage cols selected: gene_LF3_ColsSel +gene_LF3_ColsSel = gene_LF3.copy() +gene_LF3_ColsSel.index +gene_LF3_ColsSel.columns +#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'] # delete + +# copy column to allow cross checks after stripping white space +gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip() +gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage'] +all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts()) +gene_LF3_ColsSel['lineage_corrupt'].value_counts() + +gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'position', 'snp_frequency', 'lineage', 
'lineage_corrupt']] +gene_LF3_ColsSel.columns + +#%% tidy_split(): Lineage +# Create df with tidy_split: lineage +lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';') +lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip() +lf_lin_split['lineage_corrupt'].value_counts() +#%% Important sanity check for tidy_split(): lineage +# Split based on semi colon dr_muts_col +search = ";" + +# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_lin = gene_LF3_ColsSel[['lineage']].copy() +count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I) +lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index() +lin_sc_C + +lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count'] +lin_sc_C +expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum() +expected_linC + +if (len(lf_lin_split) == expected_linC): + print('\nPASS: tidy_split() length match for lineage') +else: + sys.exit('\nFAIL: tidy_split() length MISMATCH. 
Check lineage semicolon count') diff --git a/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R b/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R new file mode 100644 index 0000000..972def0 --- /dev/null +++ b/scripts/plotting/plotting_thesis/gid/gid_ORandSNP_results.R @@ -0,0 +1,204 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +################################################################### +# FIXME: ADD distance to NA when SP replies + +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + +# LigDist_colname # from globals used +# ppi2Dist_colname #from globals used +# naDist_colname #from globals used + +common_cols = c("mutationinformation" + , "X5uhc_position" + , "X5uhc_offset" + , "position" + , "dst_mode" + , "mutation_info_labels" + , "sensitivity", dist_columns ) + + +######################################## +categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) +fact_cols = colnames(merged_df3)[categ_cols_to_factor] + +if (any(lapply(merged_df3[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) + if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +}else{ + cat("\nRequested cols aready factors") +} + +cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) + +#################################### +# merged_df3: NECESSARY pre-processing +################################### +#df3 = merged_df3 +plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" + , all_cols) + +all_cols = c(common_cols + , all_stability_cols + , 
all_affinity_cols + , all_conserv_cols) + + +# counting +foo = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , "avg_lig_affinity" + , "avg_lig_affinity_scaled" + , "avg_lig_affinity_outcome" + , "ligand_distance" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome" + , "consurf_colour_rev" + , "consurf_outcome")] + +table(foo$consurf_outcome) +foo2 = foo[foo$ligand_distance<10,] + +table(foo2$ligand_outcome) + +############################# +# wide plots SNP +# DRUG +length(aa_pos_drug); aa_pos_drug +drug = foo[foo$position%in%aa_pos_drug,] +drug$wild_pos +#CA + + +############################################### +# OR plot + +bar = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + , "pval_fisher" + #, "pval_chisq" + , "neglog_pval_fisher" + , "log10_or_mychisq")] + +# bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni") +# bar$signif_bon = bar$p_adj_bonferroni +# bar = dplyr::mutate(bar +# , signif_bon = case_when(signif_bon == 0.05 ~ "." +# , signif_bon <=0.0001 ~ '****' +# , signif_bon <=0.001 ~ '***' +# , signif_bon <=0.01 ~ '**' +# , signif_bon <0.05 ~ '*' +# , TRUE ~ 'ns')) + +bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH") +bar$signif_fdr = bar$p_adj_fdr +bar = dplyr::mutate(bar + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." 
+ , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) + +# sort df +bar = bar[order(bar$or_mychisq, decreasing = T), ] +bar = bar[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + #, "pval_fisher" + #, "pval_chisq" + #, "neglog_pval_fisher" + #, "log10_or_mychisq" + #, "signif_bon" + , "p_adj_fdr" + , "signif_fdr")] + +table(bar$sensitivity) + +table(bar$or_mychisq>1&bar$signif_fdr) # sen and res ~ OR + +str(bar) +sen = bar[bar$or_mychisq<1,] +sen = na.omit(sen) + +res = bar[bar$or_mychisq>1,] +res = na.omit(res) + +# comp +bar_or = bar[!is.na(bar$or_mychisq),] +table(bar_or$sensitivity) + +sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR +res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR + +# sanity check +if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){ + cat("\nPASS: df with or successfully sourced" + , "\nCalculating % of muts with OR>1") +}else{ + stop("Abort: df with or numbers mimatch") +} + +# percent for OR muts +pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR +cat("\nPercentage of muts with OR>1 i.e resistant:" + , pc_orR *100 ) + +# muts with highest OR +head(bar_or$mutationinformation, 10) + +# sort df +bar_or = bar_or[order(bar_or$or_mychisq + , bar_or$ligand_distance + , bar_or$interface_dist + , decreasing = T), ] + +bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no") +table(bar_or$drug_site) + +bar_or$dsl_site = ifelse(bar_or$position%in%aa_pos_dsl, "dsl", "no") +table(bar_or$dsl_site) + +bar_or$ca_site = ifelse(bar_or$position%in%aa_pos_ca, "ca", "no") +table(bar_or$ca_site) + +bar_or$cdl_site = ifelse(bar_or$position%in%aa_pos_cdl, "cdl", "no") +table(bar_or$cdl_site) + + +top10_or = bar_or[1:10,] + +# are these active sites +top10_or$position[top10_or$position%in%active_aa_pos] + + +# clostest most sig +bar_or_lig = bar_or[bar_or$ligand_distance<10,] +bar_or_lig 
= bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ] +table(bar_or_lig$signif_fdr) + + +bar_or_ppi = bar_or[bar_or$interface_dist<10,] +bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ] +table(bar_or_ppi$signif_fdr) diff --git a/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R b/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R new file mode 100644 index 0000000..eb37c9f --- /dev/null +++ b/scripts/plotting/plotting_thesis/gid/gid_appendix_tables.R @@ -0,0 +1,426 @@ +#!/usr/bin/env Rscript + +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +outdir_stats = paste0(outdir_images,"stats/") +cat("\nOutput dir for stats:", outdir_stats) +################################################################### +geneL_normal = c("pnca") +#geneL_na = c("gid", "rpob") +geneL_na_v2 = c("gid") +geneL_nca = c("alr", "embb", "katg", "rpob") +geneL_both = c("rpob") + + +if (tolower(gene)%in%geneL_na_v2) { + gene_colnames = c("mcsm_na_affinity", "mcsm_na_outcome") +} + +if (tolower(gene)%in%geneL_nca) { + gene_colnames = c("mcsm_nca_affinity", "mcsm_nca_outcome") +} + +#from plotting_globals() +LigDist_colname +ppi2Dist_colname +naDist_colname + +delta_symbol #delta_symbol = "\u0394"; delta_symbol +angstroms_symbol + +cat("\nAffinity Distance colnames:", length(affinity_dist_colnames) + , "\nThese are:", affinity_dist_colnames) +#=========== +# Data used +#=========== +df3 = merged_df3 + +cols_to_output = c("position" + , "sensitivity" + , "mutationinformation" + , affinity_dist_colnames[1] + , "ligand_affinity_change" + , "ligand_outcome" + , "mmcsm_lig" + , "mmcsm_lig_outcome" + , affinity_dist_colnames[2] + # #, affinity_dist_colnames[3] + # , "mcsm_na_affinity" + # , "mcsm_na_outcome" + # #, "mcsm_nca_affinity" + # #, 
"mcsm_nca_outcome" + , gene_colnames + , "maf" + , "or_mychisq" + , "pval_fisher") + +cols_to_output +df3_output = df3[, cols_to_output] +colnames(df3_output) +cat("\nSelecting columns:", length(colnames(df3_output))) +#=============================================== +# Add COLS and rounding: adjusted P-values + MAF +#============================================== +#----------------------------- +# adjusted P-values +#----------------------------- +# add cols: p_adj_fdr and signif_fdr +df3_output$p_adj_fdr = p.adjust(df3_output$pval_fisher, method = "fdr") +df3_output$signif_fdr = df3_output$p_adj_fdr +df3_output = dplyr::mutate(df3_output + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." + , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) +# rounding +df3_output$or_mychisq = round(df3_output$or_mychisq,2) +df3_output$p_adj_fdr = round(df3_output$p_adj_fdr,2) +head(df3_output) + +#---------- +# MAF (%) +#---------- +# add col maf_percent +df3_output$maf_percent = df3_output$maf*100 + +# rounding +df3_output$maf_percent = round(df3_output$maf_percent,2) +head(df3_output$af); head(df3_output$maf);head(df3_output$maf_percent) + +#---------- +# P-value +#---------- +df3_output$pval_fisher = round(df3_output$pval_fisher,2) + +class(df3_output) +head(df3_output) + +#################################### +# Appendix: ligand affinity +#################################### +df_lig = df3_output[df3_output[[LigDist_colname]]