From 7a10b4f223131ef01ab954d3d33c476535eaa0b8 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 22 Apr 2022 17:51:11 +0100 Subject: [PATCH] phewwww! finally resolved counts for ambiguous muts --- scripts/data_extraction.py | 550 +++++++++++++++++++++++++++++++++---- 1 file changed, 504 insertions(+), 46 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index de3791e..ceab5d0 100644 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -53,6 +53,9 @@ import numpy as np import argparse from statistics import mean, median, mode from statistics import multimode +# adding values for common keys +import itertools +import collections #======================================================================= #%% dir and local imports homedir = os.path.expanduser('~') @@ -370,7 +373,7 @@ all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) ############################################################################## #%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. -# Split based on semi colon dr_muts_col and other_muts_col +# Split based on semi colon dr_muts_col search = ";" # count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence @@ -387,13 +390,7 @@ expected_drC # count no. of nsSNPs and extract those nsSNPs count_df_dr['dr_geneSNP_count'] = meta_gene_dr.loc[:, dr_muts_col].str.count(nssnp_match, re.I) dr_gene_count1 = count_df_dr['dr_geneSNP_count'].sum() -# DEL: need to count by find_all -# dr_snps_df = meta_gene_dr.loc[:, dr_muts_col].str.extract(nssnp_match_captureG, re.I) -# dr_snps_df.columns = [dr_muts_col] -# dr_snps_countU = dr_snps_df[dr_muts_col].nunique() -# dr_snps_countU -# dr_snps_count_dict = dr_snps_df[dr_muts_col].value_counts().to_dict() -# dr_snps_count_dict + ############################################################################### # This is to find out how many samples have 1 and more than 1 mutation,so you # can use it to check if your data extraction process for dr_muts @@ -482,7 +479,7 @@ else: # Extract only the samples/rows with nssnp_match #dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)] -dr_gene_WF0_v2 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match_captureG, regex = True, case = False)] +#dr_gene_WF0_v2 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match_captureG, regex = True, case = False)] print('Lengths after tidy split and extracting', nssnp_match, 'muts:' , '\nOld length:' , len(meta_gene_dr) @@ -491,33 +488,6 @@ print('Lengths after tidy split and extracting', nssnp_match, 'muts:' , '\nExpected len:', dr_gene_count , '\n=============================================================') -if len(dr_gene_WF0) == dr_gene_count: - print('PASS: length matches expected length' - , '\n===============================================================') -else: - sys.exit('FAIL: lengths mismatch') - -# count the freq of 'dr_muts' samples -dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] -print('Dim of dr_muts_df:', dr_muts_df.shape) - -# add freq column -dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') -#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count') -print('Revised dim of dr_muts_df:', dr_muts_df.shape) - -c1 = dr_muts_df.dr_sample_freq.value_counts() -print('Counting no. of sample frequency:\n', c1 - , '\n===================================================================') - -# sanity check: length of gene samples -if len(dr_gene_WF0) == c1.sum(): - print('PASS: WF data has expected length' - , '\nLength of dr_gene WFO:', c1.sum() - , '\n===============================================================') -else: - sys.exit('FAIL: length mismatch!') - # Important: Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) @@ -525,9 +495,13 @@ print('Dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' , '\n===============================================================') + +if other_muts_col in dr_df.columns: + print('Dropping:', other_muts_col, 'from WF gene dr_df') + dr_df = dr_df.drop([other_muts_col], axis = 1) ######################################################################## #%% Important sanity checks: other muts column for tidy split(), nsSNPs, etc. - +# Split based on semi colon on other_muts_col # count of occurrence of ";" in other_muts_col: No.of semicolons + 1 is no. of rows created * occurence count_df_other = meta_gene_other[['id', other_muts_col]] count_df_other['other_semicolon_count'] = meta_gene_other.loc[:, other_muts_col].str.count(search, re.I) @@ -540,13 +514,497 @@ expected_otherC # count no. of nsSNPs and extract those nsSNPs count_df_other['other_geneSNP_count'] = meta_gene_other.loc[:, other_muts_col].str.count(nssnp_match, re.I) +other_gene_count1 = count_df_other['other_geneSNP_count'].sum() -# DEL: need to count by find_all -# other_snps_df = meta_gene_other.loc[:, other_muts_col].str.extract(nssnp_match_captureG, re.I) -# other_snps_df -# other_snps_df.columns = [other_muts_col] -# other_snps_countU = other_snps_df[other_muts_col].nunique() -# other_snps_countU -# other_snps_count_dict = other_snps_df[other_muts_col].value_counts().to_dict() -# other_snps_count_dict -############################################################################### \ No newline at end of file +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. +# This will have: unique COMBINATION of sample id and mutations + +#======== +# Second: counting mutations in other_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[other_muts_col]) + +# sanity check: count na +na_count = meta_data[other_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs =', na_count, '/', total_samples + , '\n=========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +other_gene_count = 0 +wt_other = 0 +id_other = [] +id2_other = [] +other_gene_mutsL = [] + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_other.append(id) + #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE)) + gene_otherL = re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE) + #print(count_gene_other) + if count_gene_other > 0: + id_other.append(id) + if count_gene_other > 1: + id2_other.append(id) + #print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + other_gene_mutsL = other_gene_mutsL + gene_otherL + count_wt = clean_df[other_muts_col].iloc[i].count('WT') + wt_other = wt_other + count_wt + +print('RESULTS:') +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) ) +print('Total matches of UNIQUE', gene, 'SNP matches in', other_muts_col, ':', len(set(other_gene_mutsL))) +print('=================================================================') + +if other_gene_count == other_gene_count1: + print('\nPass: other gene SNP count match') +else: + sys.exit('\nFAIL: other gene SNP count MISmatch') + +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) +#%% tidy_split(): other_muts_col +#========= +# DF2: other_muts_col +# and remove leading white spaces +#========= +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 + , '\n============================================================') + +# apply tidy_split() +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') +# remove the leading white spaces in the column +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() + +# extract only the samples/rows with nssnp_match +#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)] + +print('Lengths after tidy split and extracting', gene_match, 'muts:', + '\nOld length:' , len(meta_gene_other), + '\nLength after split:', len(other_WF1), + '\nLength of nssnp df:', len(other_gene_WF1), + '\nExpected len:', other_gene_count + , '\n=============================================================') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df.shape + , '\n===============================================================' + , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' + , '\n===============================================================') + +if dr_muts_col in other_df.columns: + print('Dropping:', dr_muts_col, 'from WF gene other_df') + other_df = other_df.drop([dr_muts_col], axis = 1) +############################################################################### +#%% Finding ambiguous muts +common_snps_dr_other = set(dr_gene_mutsL).intersection(set(other_gene_mutsL)) + +#%% More sanity checks: expected unique snps and nrows in LF data +expected_unique_snps = len(set(dr_gene_mutsL)) + len(set(other_gene_mutsL)) - len(common_snps_dr_other) +expected_rows = dr_gene_count + other_gene_count + +#%% Useful results to note==> counting dr, other and common muts +print('\n===================================================================' + , '\nCount unique nsSNPs for', gene, ':' + , expected_unique_snps + , '\n===================================================================') + +Vcounts_dr = pd.Series(dr_gene_mutsL).value_counts() +Vcounts_common_dr = Vcounts_dr.get(list(common_snps_dr_other)) +print('\n===================================================================' + , "\nCount of samples for common muts in dr muts\n" + , Vcounts_common_dr + , '\n===================================================================') + +Vcounts_other = pd.Series(other_gene_mutsL).value_counts() +Vcounts_common_other = Vcounts_other.get(list(common_snps_dr_other)) + +print('\n===================================================================' + , '\nCount of samples for common muts in other muts\n' + , Vcounts_common_other + , '\n===================================================================') + +print('\n===================================================================' + , '\nPredicting total no. of rows in the curated df:', expected_rows + , '\n===================================================================') + +#%%another way: Add value checks for dict so you can know if its correct for LF data count below +dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict() +other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict() + +for k, v in dr_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +for k, v in other_snps_vc_dict.items(): + if k in common_snps_dr_other: + print(k,v) + +# using defaultdict +Cdict = collections.defaultdict(int) + +# iterating key, val with chain() +for key, val in itertools.chain(dr_snps_vc_dict.items(), other_snps_vc_dict.items()): + if key in common_snps_dr_other: + Cdict[key] += val + else: + Cdict[key] = val + +print(dict(Cdict)) + +for k, v in Cdict.items(): + if k in common_snps_dr_other: + print(k,v) +############################################################################### +# USE Vcounts to get expected curated df +# resolve dm om muts funda +#%%#%% Concatenating two dfs: gene_LF0 +# equivalent of rbind in R +#========== +# Concatentating the two dfs: equivalent of rbind in R +#========== +# Important: Change column names to allow concat: +# dr_muts.. & other_muts : 'mutation' +print('Now concatenating the two dfs by row' + , '\nFirst assigning a common colname: "mutation" to the col containing muts' + , '\nThis is done for both dfs' + , '\n===================================================================') + +dr_df.columns +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) +dr_df.columns + +other_df.columns +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) +other_df.columns + +if len(dr_df.columns) == len(other_df.columns): + print('Checking dfs for concatening by rows:' + , '\nDim of dr_df:', dr_df.shape + , '\nDim of other_df:', other_df.shape + , '\nExpected nrows:', len(dr_df) + len(other_df) + , '\n=============================================================') +else: + sys.exit('FAIL: No. of cols mismatch for concatenating') + +# checking colnames before concat +print('Checking colnames BEFORE concatenating the two dfs...') +if (set(dr_df.columns) == set(other_df.columns)): + print('PASS: column names match necessary for merging two dfs') +else: + sys.exit('FAIL: Colnames mismatch for concatenating!') + +# concatenate (axis = 0): Rbind, adn keeo original index +print('Now appending the two dfs: Rbind') +gene_LF_comb = pd.concat([dr_df, other_df] + #, ignore_index = True + , axis = 0) + +if gene_LF_comb.index.nunique() == len(meta_gene_all.index): + print('\nPASS:length of index in LF data:', len(gene_LF_comb.index) + , '\nLength of unique index in LF data:', gene_LF_comb.index.nunique() + ) +else: + sys.exit('\nFAIL: expected length for combined LF data mismatch:' + , '\nExpected length:', len(meta_gene_all.index) + , '\nGot:', gene_LF_comb.index.nunique() ) + +print('Finding stop mutations in concatenated df') +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +if stop_muts == 0: + print('PASS: No stop mutations detected') +else: + print('stop mutations detected' + , '\nNo. of stop muts:', stop_muts, '\n' + , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + , '\nNow removing them') + +gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] +print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) + +gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] +print('snps only by direct regex:', len(gene_LF0)) + +if gene_LF0_nssnp.equals(gene_LF0): + print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed' + , '\nUsing the regex extracted df') +else: + sys.exit('FAIL: posssibly regex or no of stop mutations' + , 'Regex being used:', nssnp_match) + #sys.exit() + +# checking colnames and length after concat +print('Checking colnames AFTER concatenating the two dfs...') +if (set(dr_df.columns) == set(gene_LF0.columns)): + print('PASS: column names match' + , '\n=============================================================') +else: + sys.exit('FAIL: Colnames mismatch AFTER concatenating') + +print('Checking concatenated df') +if len(gene_LF0) == (len(dr_df) + len(other_df))- stop_muts: + print('PASS:length of df after concat match' + , '\n===============================================================') +else: + sys.exit('FAIL: length mismatch') + +#%% NEW check2:id, sample, etc. +gene_LF0['id'].count() +gene_LF0['id'].nunique() +gene_LF0['sample'].nunique() +gene_LF0['mutation_info'].value_counts() +gene_LF0[drug].isna().sum() +#%% Concatenating two dfs sanity checks: gene_LF1 +#========================================================= +# This is hopefully clean data, but just double check +# Filter LF data so that you only have +# mutations corresponding to nssnp_match (case insensitive) +# this will be your list you run OR calcs +#========================================================= +print('Length of gene_LF0:', len(gene_LF0) + , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene + , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) + +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] + +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', + '\nConfirming extraction and concatenating worked correctly' + , '\n==========================================================') +else: + print('FAIL: BUT NOT FATAL!' + , '\ngene_LF0 and gene_LF1 lengths differ' + , '\nsuggesting error in extraction process' + , ' use gene_LF1 for downstreama analysis' + , '\n=========================================================') + +print('Length of dfs pre and post processing...' + , '\ngene WF data (unique samples in each row):',len(meta_gene_all) + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) + , '\n=============================================================') + +# sanity check for extraction +# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' +print('Verifying whether extraction process worked correctly...') +if len(gene_LF1) == expected_rows: + print('PASS: extraction process performed correctly' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) + , '\n=========================================================') +else: + print('FAIL: extraction process has bugs' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nDebug please' + , '\n=========================================================') + +# more sanity checks +print('Performing some more sanity checks...') + +# From LF1: useful for OR counts +# no. of unique muts +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) + +# no. of samples contributing the unique muts +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique()) + +# no. of dr and other muts +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique()) + +# sanity check +if len(distinct_muts) == mut_grouped.sum() : + print('PASS:length of LF1 is as expected' + , '\n===============================================================') +else: + print('FAIL: Mistmatch in count of muts' + , '\nExpected count:', len(distinct_muts) + , '\nActual count:', mut_grouped.sum() + , '\nMuts should be distinct within dr* and other* type' + , '\nInspecting...possibly ambiguous muts' + , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts) + , '\n=========================================================') + +muts_split = list(gene_LF1.groupby('mutation_info')) +dr_muts = muts_split[0][1].mutation +other_muts = muts_split[1][1].mutation +print('splitting muts by mut_info:', muts_split) +print('no.of dr_muts samples:', len(dr_muts)) +print('no. of other_muts samples', len(other_muts)) +#%% Ambiguous muts +# IMPORTANT: The same mutation cannot be classed as a drug AND 'others' +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' + , '\n===============================================================') +else: + print('PASS: NO ambiguous muts detected' + , '\nMuts are unique to dr_ and other_ mutation class' + , '\n=========================================================') + +# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected! +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('Finding ambiguous muts...' + , '\n=========================================================' + , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() + , '\nThese are:', dr_muts[dr_muts.isin(other_muts)] + , '\n=========================================================' + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] + , '\n=========================================================') + + print('Counting no. of ambiguous muts...' + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + + if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): + common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') +else: + #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') + print('No: ambiguous muts present') + +#%% Ambiguous muts: revised annotation for mutation_info +ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] +ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() +ambiguous_muts_value_counts + +gene_LF1_orig = gene_LF1.copy() +gene_LF1_orig.equals(gene_LF1) + +# copy the old columns for checking +gene_LF1['mutation_info_orig'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info_v1'] = gene_LF1['mutation_info'] +gene_LF1['mutation_info'].value_counts() +#%% Inspect ambiguous muts +#===================================== +# Now create a new df that will have: + # ambiguous muts + # mutation_info + # revised mutation_info +# The revised label is based on value_counts +# for mutaiton_info. The corresponding mutation_info: +# label is chosen that corresponds to the max of value counts +#===================================== +ambig_muts_rev_df = pd.DataFrame() +changes_val = [] +changes_dict = {} + +for i in common_muts: + #print(i) + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] + + # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() + + revised_label = max_info_table[[0]].index[0][1] # max value_count + old_label = max_info_table[[1]].index[0][1] # min value_count + print('\nAmbiguous mutation labels...' + , '\nSetting mutation_info for', i, 'to', revised_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info_orig'] == old_label) + , revised_label + , temp_df['mutation_info_orig']) + ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + f = max_info_table[[1]] + old_label_count = f[[0]][0] + changes_val.append(old_label_count) + cc_dict = f.to_dict() + changes_dict.update(cc_dict) + +ambig_muts_rev_df['mutation_info_REV'].value_counts() +ambig_muts_rev_df['mutation_info_orig'].value_counts() +changes_val +changes_total = sum(changes_val) +changes_dict +#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts +#=================== +# ambiguous muts +#======================= +#dr_muts.to_csv('dr_muts.csv', header = True) +#other_muts.to_csv('other_muts.csv', header = True) +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = True) + + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) + +#%% FIXME: ambig mut counts +#======================= +# ambiguous mut counts +#======================= +out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' +outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts +print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_mut_counts) + +ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) +#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks +#%% Resolving ambiguous muts +# Merging ambiguous muts +#================= +# Merge ambig muts +# with gene_LF1 +#=================== +ambig_muts_rev_df.index +gene_LF1.index +all(ambig_muts_rev_df.index.isin(gene_LF1.index)) + +gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] +gene_LF1['mutation_info_orig'].value_counts() +gene_LF1['mutation_info_v1'].value_counts() +foo = gene_LF1.iloc[ambig_muts_rev_df.index] + +# Sanity check1: if there are still any ambiguous muts +muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) +dr_muts_rev = muts_split_rev[0][1].mutation +other_muts_rev = muts_split_rev[1][1].mutation +print('splitting muts by mut_info:', muts_split_rev) +print('no.of dr_muts samples:', len(dr_muts_rev)) +print('no. of other_muts samples', len(other_muts_rev)) + +if not dr_muts_rev.isin(other_muts_rev).sum() & other_muts_rev.isin(dr_muts_rev).sum() > 0: + print('\nAmbiguous muts corrected. Proceeding with downstream analysis') +else: + print('\nAmbiguous muts NOT corrected. Quitting!') + sys.exit() + +gene_LF1['mutation_info_v1'].value_counts() +#%% PHEW! Good to go for downstream stuff \ No newline at end of file