Finally, revised data processing is complete

This commit is contained in:
Tanushree Tunstall 2022-04-27 11:09:36 +01:00
parent ac0d14e116
commit 3c436f0c27

View file

@ -923,14 +923,14 @@ changes_val = []
changes_dict = {} changes_dict = {}
##BROKENNNN!!!! ##BROKENNNN!!!!
common_muts
gene_LF1['mutation'].head()
common_muts_lower = list((map(lambda x: x.lower(), common_muts))) common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
common_muts_lower common_muts_lower
##BROKENNNN!!!! ##BROKENNNN!!!!
#for i in common_muts: for i in common_muts:
for i in common_muts_lower: #for i in common_muts_lower:
#print(i)
print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
temp_df temp_df
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts in descending order # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts in descending order
@ -992,7 +992,6 @@ print('\n----------------------------------'
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%% FIXME: Add sanity check to make sure you can add value_count checks #%% FIXME: Add sanity check to make sure you can add value_count checks
print('\nREACHED here...................>>>>>>>')
#%% Resolving ambiguous muts #%% Resolving ambiguous muts
# Merging ambiguous muts # Merging ambiguous muts
#================= #=================
@ -1013,8 +1012,7 @@ else:
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1['mutation_info_orig'].value_counts() gene_LF1['mutation_info_orig'].value_counts()
#gene_LF1['mutation_info_v1'].value_counts() gene_LF1['mutation_info_v1'].value_counts()
foo = gene_LF1.iloc[ambig_muts_rev_df.index]
# Sanity check1: if there are still any ambiguous muts # Sanity check1: if there are still any ambiguous muts
#muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1'))
@ -1527,8 +1525,8 @@ if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['d
else: else:
print('\nFAIL: revised dst mode numbers MISmatch') print('\nFAIL: revised dst mode numbers MISmatch')
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] #foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
foo2 = foo.sort_values(['position', 'Mut']) #foo2 = foo.sort_values(['position', 'Mut'])
print('\n------------------------------------------------------' print('\n------------------------------------------------------'
, '\nRevised counting: mutation_info i.e dm om column\n' , '\nRevised counting: mutation_info i.e dm om column\n'
@ -1727,7 +1725,7 @@ len(gene_LF3)
# Dropping duplicates from lineage df # Dropping duplicates from lineage df
lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()] lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()]
lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()] lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()]
if len(lf_lin_split_U) == len(snps_only): if len(lf_lin_split_U) == len(SnpFDict):
print('\nPASS: Duplicate entries removed from lf_lin' print('\nPASS: Duplicate entries removed from lf_lin'
, '\nReady to start the final merge to generate gene_LF4') , '\nReady to start the final merge to generate gene_LF4')
else: else: