saving data extraction with final processing

This commit is contained in:
Tanushree Tunstall 2022-04-27 12:09:14 +01:00
parent 3c436f0c27
commit e419d320ac

View file

@ -71,7 +71,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true')
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
@ -925,9 +925,9 @@ changes_dict = {}
##BROKENNNN!!!!
common_muts
gene_LF1['mutation'].head()
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
common_muts_lower
##BROKENNNN!!!!
#common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
#common_muts_lower
for i in common_muts:
#for i in common_muts_lower:
#print(i)
@ -992,8 +992,7 @@ print('\n----------------------------------'
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%% FIXME: Add sanity check to make sure you can add value_count checks
#%% Resolving ambiguous muts
# Merging ambiguous muts
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
#=================
# Merge ambig muts
# with gene_LF1
@ -1034,7 +1033,7 @@ gene_LF1['mutation_info'].value_counts()
# reassign
#%% PHEW! Good to go for downstream stuff
#%% Add column: Mutationinformation
#%% Add column: Mutationinformation ==> gene_LF1
# splitting mutation column to get mCSM style muts
#=====================================================
# Formatting df: read aa dict and pull relevant info
@ -1098,7 +1097,8 @@ for k, v in my_aa_dict.items():
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
mylen1 = len(gene_LF1.columns)
#mylen1 = len(gene_LF1.columns)
mylen0_v2 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
@ -1115,14 +1115,15 @@ else:
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen1 == mylen0 + ncol_mutf_add:
#if mylen1 == mylen0 + ncol_mutf_add:
if mylen0_v2 == mylen0 + ncol_mutf_add:
print('PASS: successfully added', ncol_mutf_add, 'cols'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
, '\nnew len:', mylen0_v2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen0
, '\nnew len:', mylen1)
, '\nnew len:', mylen0_v2)
# clear variables
del(k, v, wt, mut, lookup_dict)
@ -1145,17 +1146,20 @@ gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
# sort by position inplace
foo = gene_LF1['position'].value_counts()
foo
foo = foo.sort_index()
gene_LF1.sort_values(by = ['position'], inplace = True)
bar = gene_LF1['position'].value_counts()
bar = bar.sort_index()
if all(foo == bar):
print('PASS: df ordered by position')
print(gene_LF1['position'].head())
else:
sys.exit('FAIL: df could not be ordered. Check source')
print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n')
# FIXME:Can only compare identically-labeled Series objects
#if (foo == bar).all():
# print('PASS: df ordered by position')
# print(gene_LF1['position'].head())
#else:
# print('FAIL: df could not be ordered. Check source')
# sys.exit()
#%% Create a copy of mutationinformation column for downstream mergeing
gene_LF1['Mut'] = gene_LF1['mutationinformation']
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
@ -1185,8 +1189,12 @@ gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
#test_df2 = test_df.loc[test_df['position'] == 10]
###############################################################################
cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency']
print('\nAdded', len(cols_added), 'more cols:\n'
, '\nDim of new gene_LF1:', len(gene_LF1.columns))
mylen1 = len(gene_LF1.columns) # updated my_len1
###############################################################################
#%% Add column: aa property_water
#%% Add column: aa property_water ==> gene_LF1
#=========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
@ -1235,7 +1243,7 @@ else:
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_prop_polarity
#%% Add column: aa_prop_polarity ==> gene_LF1
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
@ -1283,7 +1291,7 @@ else:
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop
#%% Add column: aa_calcprop ==> gene_LF1
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_calcprop}
@ -1923,12 +1931,12 @@ print('Finished writing:', outfile_pos
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n============================================================='
, '\n\n\n')
, '\n')
del(out_filename_pos)
###############################################################################
#%% Quick summary output
print('============================================'
print('\n============================================'
, '\nQuick summary output for', drug, 'and' , gene.lower()
, '\n============================================'
, '\nTotal samples:', total_samples
@ -1942,18 +1950,17 @@ print('============================================'
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
, '\n'
, '\nTotal no. of unique snps:', len(snps_only)
, '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
, '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
, '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD
, '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
, '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
, '\nTotal no. of samples with missense muts:', len(gene_LF4)
, '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
, '\n')
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('\nTotal no.of samples with ambiguous muts:', len(inspect)