saving data extraction with final processing
This commit is contained in:
parent
3c436f0c27
commit
e419d320ac
1 changed files with 42 additions and 35 deletions
|
@ -71,7 +71,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
||||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||||
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
|
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true')
|
||||||
|
|
||||||
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||||
|
|
||||||
|
@ -925,9 +925,9 @@ changes_dict = {}
|
||||||
##BROKENNNN!!!!
|
##BROKENNNN!!!!
|
||||||
common_muts
|
common_muts
|
||||||
gene_LF1['mutation'].head()
|
gene_LF1['mutation'].head()
|
||||||
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
#common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
||||||
common_muts_lower
|
#common_muts_lower
|
||||||
##BROKENNNN!!!!
|
|
||||||
for i in common_muts:
|
for i in common_muts:
|
||||||
#for i in common_muts_lower:
|
#for i in common_muts_lower:
|
||||||
#print(i)
|
#print(i)
|
||||||
|
@ -992,8 +992,7 @@ print('\n----------------------------------'
|
||||||
|
|
||||||
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
||||||
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
||||||
#%% Resolving ambiguous muts
|
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
|
||||||
# Merging ambiguous muts
|
|
||||||
#=================
|
#=================
|
||||||
# Merge ambig muts
|
# Merge ambig muts
|
||||||
# with gene_LF1
|
# with gene_LF1
|
||||||
|
@ -1034,7 +1033,7 @@ gene_LF1['mutation_info'].value_counts()
|
||||||
|
|
||||||
# reassign
|
# reassign
|
||||||
#%% PHEW! Good to go for downstream stuff
|
#%% PHEW! Good to go for downstream stuff
|
||||||
#%% Add column: Mutationinformation
|
#%% Add column: Mutationinformation ==> gene_LF1
|
||||||
# splitting mutation column to get mCSM style muts
|
# splitting mutation column to get mCSM style muts
|
||||||
#=====================================================
|
#=====================================================
|
||||||
# Formatting df: read aa dict and pull relevant info
|
# Formatting df: read aa dict and pull relevant info
|
||||||
|
@ -1098,7 +1097,8 @@ for k, v in my_aa_dict.items():
|
||||||
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
|
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
|
||||||
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
|
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
|
||||||
|
|
||||||
mylen1 = len(gene_LF1.columns)
|
#mylen1 = len(gene_LF1.columns)
|
||||||
|
mylen0_v2 = len(gene_LF1.columns)
|
||||||
|
|
||||||
# sanity checks
|
# sanity checks
|
||||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||||
|
@ -1115,14 +1115,15 @@ else:
|
||||||
, '\nmutant-type\n', mut
|
, '\nmutant-type\n', mut
|
||||||
, '\ndim of df:', gene_LF1.shape)
|
, '\ndim of df:', gene_LF1.shape)
|
||||||
|
|
||||||
if mylen1 == mylen0 + ncol_mutf_add:
|
#if mylen1 == mylen0 + ncol_mutf_add:
|
||||||
|
if mylen0_v2 == mylen0 + ncol_mutf_add:
|
||||||
print('PASS: successfully added', ncol_mutf_add, 'cols'
|
print('PASS: successfully added', ncol_mutf_add, 'cols'
|
||||||
, '\nold length:', mylen0
|
, '\nold length:', mylen0
|
||||||
, '\nnew len:', mylen1)
|
, '\nnew len:', mylen0_v2)
|
||||||
else:
|
else:
|
||||||
print('FAIL: failed to add cols:'
|
print('FAIL: failed to add cols:'
|
||||||
, '\nold length:', mylen0
|
, '\nold length:', mylen0
|
||||||
, '\nnew len:', mylen1)
|
, '\nnew len:', mylen0_v2)
|
||||||
|
|
||||||
# clear variables
|
# clear variables
|
||||||
del(k, v, wt, mut, lookup_dict)
|
del(k, v, wt, mut, lookup_dict)
|
||||||
|
@ -1137,7 +1138,7 @@ print('Created column: mutationinformation'
|
||||||
, '\n=====================================================================\n'
|
, '\n=====================================================================\n'
|
||||||
, gene_LF1.mutationinformation.head(10))
|
, gene_LF1.mutationinformation.head(10))
|
||||||
|
|
||||||
#order by position for convenience
|
# order by position for convenience
|
||||||
gene_LF1.dtypes
|
gene_LF1.dtypes
|
||||||
|
|
||||||
# converting position to numeric
|
# converting position to numeric
|
||||||
|
@ -1145,17 +1146,20 @@ gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
|
||||||
|
|
||||||
# sort by position inplace
|
# sort by position inplace
|
||||||
foo = gene_LF1['position'].value_counts()
|
foo = gene_LF1['position'].value_counts()
|
||||||
foo
|
foo = foo.sort_index()
|
||||||
|
|
||||||
gene_LF1.sort_values(by = ['position'], inplace = True)
|
gene_LF1.sort_values(by = ['position'], inplace = True)
|
||||||
bar = gene_LF1['position'].value_counts()
|
bar = gene_LF1['position'].value_counts()
|
||||||
|
bar = bar.sort_index()
|
||||||
|
|
||||||
|
if all(foo == bar):
|
||||||
|
print('PASS: df ordered by position')
|
||||||
|
print(gene_LF1['position'].head())
|
||||||
|
else:
|
||||||
|
sys.exit('FAIL: df could not be ordered. Check source')
|
||||||
|
|
||||||
|
print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n')
|
||||||
|
|
||||||
# FIXME:Can only compare identically-labeled Series objects
|
|
||||||
#if (foo == bar).all():
|
|
||||||
# print('PASS: df ordered by position')
|
|
||||||
# print(gene_LF1['position'].head())
|
|
||||||
#else:
|
|
||||||
# print('FAIL: df could not be ordered. Check source')
|
|
||||||
# sys.exit()
|
|
||||||
#%% Create a copy of mutationinformation column for downstream merging
|
#%% Create a copy of mutationinformation column for downstream merging
|
||||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||||
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
|
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
|
||||||
|
@ -1185,8 +1189,12 @@ gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
|
||||||
|
|
||||||
#test_df2 = test_df.loc[test_df['position'] == 10]
|
#test_df2 = test_df.loc[test_df['position'] == 10]
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency']
|
||||||
|
print('\nAdded', len(cols_added), 'more cols:\n'
|
||||||
|
, '\nDim of new gene_LF1:', len(gene_LF1.columns))
|
||||||
|
mylen1 = len(gene_LF1.columns) # updated mylen1
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#%% Add column: aa property_water
|
#%% Add column: aa property_water ==> gene_LF1
|
||||||
#=========
|
#=========
|
||||||
# iterate through the dict and create a lookup dict, i.e.
|
# iterate through the dict and create a lookup dict, i.e.
|
||||||
# lookup_dict = {three_letter_code: aa_prop_water}
|
# lookup_dict = {three_letter_code: aa_prop_water}
|
||||||
|
@ -1235,7 +1243,7 @@ else:
|
||||||
# clear variables
|
# clear variables
|
||||||
del(k, v, wt, mut, lookup_dict)
|
del(k, v, wt, mut, lookup_dict)
|
||||||
|
|
||||||
#%% Add column: aa_prop_polarity
|
#%% Add column: aa_prop_polarity ==> gene_LF1
|
||||||
#========
|
#========
|
||||||
# iterate through the dict and create a lookup dict, i.e.
|
# iterate through the dict and create a lookup dict, i.e.
|
||||||
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
||||||
|
@ -1283,7 +1291,7 @@ else:
|
||||||
|
|
||||||
# clear variables
|
# clear variables
|
||||||
del(k, v, wt, mut, lookup_dict)
|
del(k, v, wt, mut, lookup_dict)
|
||||||
#%% Add column: aa_calcprop
|
#%% Add column: aa_calcprop ==> gene_LF1
|
||||||
#========
|
#========
|
||||||
# iterate through the dict and create a lookup dict, i.e.
|
# iterate through the dict and create a lookup dict, i.e.
|
||||||
# lookup_dict = {three_letter_code: aa_calcprop}
|
# lookup_dict = {three_letter_code: aa_calcprop}
|
||||||
|
@ -1923,12 +1931,12 @@ print('Finished writing:', outfile_pos
|
||||||
, '\nNo. of rows:', len(pos_only_sorted)
|
, '\nNo. of rows:', len(pos_only_sorted)
|
||||||
, '\nNo. of cols:', len(pos_only_sorted.columns)
|
, '\nNo. of cols:', len(pos_only_sorted.columns)
|
||||||
, '\n============================================================='
|
, '\n============================================================='
|
||||||
, '\n\n\n')
|
, '\n')
|
||||||
|
|
||||||
del(out_filename_pos)
|
del(out_filename_pos)
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#%% Quick summary output
|
#%% Quick summary output
|
||||||
print('============================================'
|
print('\n============================================'
|
||||||
, '\nQuick summary output for', drug, 'and' , gene.lower()
|
, '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||||
, '\n============================================'
|
, '\n============================================'
|
||||||
, '\nTotal samples:', total_samples
|
, '\nTotal samples:', total_samples
|
||||||
|
@ -1942,18 +1950,17 @@ print('============================================'
|
||||||
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
|
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
|
||||||
, '\n'
|
, '\n'
|
||||||
|
|
||||||
, '\nTotal no. of unique snps:', len(snps_only)
|
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
|
||||||
, '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
|
||||||
|
|
||||||
, '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
|
, '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
|
||||||
, '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD
|
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
|
||||||
|
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
||||||
|
|
||||||
|
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
|
||||||
, '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
|
|
||||||
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
|
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
|
||||||
, '\nTotal no. of samples with missense muts:', len(gene_LF4)
|
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
|
||||||
, '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
|
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
|
||||||
, '\n')
|
)
|
||||||
|
|
||||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue