saving data extraction with final processing
This commit is contained in:
parent
3c436f0c27
commit
e419d320ac
1 changed files with 42 additions and 35 deletions
|
@ -71,7 +71,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
|||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
|
||||
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true')
|
||||
|
||||
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||
|
||||
|
@ -925,9 +925,9 @@ changes_dict = {}
|
|||
##BROKENNNN!!!!
|
||||
common_muts
|
||||
gene_LF1['mutation'].head()
|
||||
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
||||
common_muts_lower
|
||||
##BROKENNNN!!!!
|
||||
#common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
||||
#common_muts_lower
|
||||
|
||||
for i in common_muts:
|
||||
#for i in common_muts_lower:
|
||||
#print(i)
|
||||
|
@ -992,8 +992,7 @@ print('\n----------------------------------'
|
|||
|
||||
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
||||
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
||||
#%% Resolving ambiguous muts
|
||||
# Merging ambiguous muts
|
||||
#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
|
||||
#=================
|
||||
# Merge ambig muts
|
||||
# with gene_LF1
|
||||
|
@ -1034,7 +1033,7 @@ gene_LF1['mutation_info'].value_counts()
|
|||
|
||||
# reassign
|
||||
#%% PHEW! Good to go for downstream stuff
|
||||
#%% Add column: Mutationinformation
|
||||
#%% Add column: Mutationinformation ==> gene_LF1
|
||||
# splitting mutation column to get mCSM style muts
|
||||
#=====================================================
|
||||
# Formatting df: read aa dict and pull relevant info
|
||||
|
@ -1098,7 +1097,8 @@ for k, v in my_aa_dict.items():
|
|||
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
|
||||
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
|
||||
|
||||
mylen1 = len(gene_LF1.columns)
|
||||
#mylen1 = len(gene_LF1.columns)
|
||||
mylen0_v2 = len(gene_LF1.columns)
|
||||
|
||||
# sanity checks
|
||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||
|
@ -1115,14 +1115,15 @@ else:
|
|||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
|
||||
if mylen1 == mylen0 + ncol_mutf_add:
|
||||
#if mylen1 == mylen0 + ncol_mutf_add:
|
||||
if mylen0_v2 == mylen0 + ncol_mutf_add:
|
||||
print('PASS: successfully added', ncol_mutf_add, 'cols'
|
||||
, '\nold length:', mylen0
|
||||
, '\nnew len:', mylen1)
|
||||
, '\nnew len:', mylen0_v2)
|
||||
else:
|
||||
print('FAIL: failed to add cols:'
|
||||
, '\nold length:', mylen0
|
||||
, '\nnew len:', mylen1)
|
||||
, '\nnew len:', mylen0_v2)
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
@ -1145,17 +1146,20 @@ gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
|
|||
|
||||
# sort by position inplace
|
||||
foo = gene_LF1['position'].value_counts()
|
||||
foo
|
||||
foo = foo.sort_index()
|
||||
|
||||
gene_LF1.sort_values(by = ['position'], inplace = True)
|
||||
bar = gene_LF1['position'].value_counts()
|
||||
bar = bar.sort_index()
|
||||
|
||||
if all(foo == bar):
|
||||
print('PASS: df ordered by position')
|
||||
print(gene_LF1['position'].head())
|
||||
else:
|
||||
sys.exit('FAIL: df could not be ordered. Check source')
|
||||
|
||||
print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n')
|
||||
|
||||
# FIXME:Can only compare identically-labeled Series objects
|
||||
#if (foo == bar).all():
|
||||
# print('PASS: df ordered by position')
|
||||
# print(gene_LF1['position'].head())
|
||||
#else:
|
||||
# print('FAIL: df could not be ordered. Check source')
|
||||
# sys.exit()
|
||||
#%% Create a copy of mutationinformation column for downstream mergeing
|
||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
|
||||
|
@ -1185,8 +1189,12 @@ gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
|
|||
|
||||
#test_df2 = test_df.loc[test_df['position'] == 10]
|
||||
###############################################################################
|
||||
cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency']
|
||||
print('\nAdded', len(cols_added), 'more cols:\n'
|
||||
, '\nDim of new gene_LF1:', len(gene_LF1.columns))
|
||||
mylen1 = len(gene_LF1.columns) # updated my_len1
|
||||
###############################################################################
|
||||
#%% Add column: aa property_water
|
||||
#%% Add column: aa property_water ==> gene_LF1
|
||||
#=========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_water}
|
||||
|
@ -1235,7 +1243,7 @@ else:
|
|||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
#%% Add column: aa_prop_polarity
|
||||
#%% Add column: aa_prop_polarity ==> gene_LF1
|
||||
#========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
||||
|
@ -1283,7 +1291,7 @@ else:
|
|||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
#%% Add column: aa_calcprop
|
||||
#%% Add column: aa_calcprop ==> gene_LF1
|
||||
#========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_calcprop}
|
||||
|
@ -1923,12 +1931,12 @@ print('Finished writing:', outfile_pos
|
|||
, '\nNo. of rows:', len(pos_only_sorted)
|
||||
, '\nNo. of cols:', len(pos_only_sorted.columns)
|
||||
, '\n============================================================='
|
||||
, '\n\n\n')
|
||||
, '\n')
|
||||
|
||||
del(out_filename_pos)
|
||||
###############################################################################
|
||||
#%% Quick summary output
|
||||
print('============================================'
|
||||
print('\n============================================'
|
||||
, '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||
, '\n============================================'
|
||||
, '\nTotal samples:', total_samples
|
||||
|
@ -1942,18 +1950,17 @@ print('============================================'
|
|||
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
|
||||
, '\n'
|
||||
|
||||
, '\nTotal no. of unique snps:', len(snps_only)
|
||||
, '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
||||
, '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
|
||||
|
||||
, '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
|
||||
, '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD
|
||||
, '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique()
|
||||
, '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
|
||||
, '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
|
||||
|
||||
|
||||
, '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
|
||||
, '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
|
||||
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
|
||||
, '\nTotal no. of samples with missense muts:', len(gene_LF4)
|
||||
, '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
|
||||
, '\n')
|
||||
, '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
|
||||
, '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
|
||||
)
|
||||
|
||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue