diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index b064ec1..6fd8ca6 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -923,14 +923,14 @@ changes_val = [] changes_dict = {} ##BROKENNNN!!!! - +common_muts +gene_LF1['mutation'].head() common_muts_lower = list((map(lambda x: x.lower(), common_muts))) common_muts_lower ##BROKENNNN!!!! -#for i in common_muts: -for i in common_muts_lower: - - print(i) +for i in common_muts: +#for i in common_muts_lower: + #print(i) temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] temp_df # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending @@ -992,7 +992,6 @@ print('\n----------------------------------' ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) #%% FIXME: Add sanity check to make sure you can add value_count checks -print('\nREACHED here...................>>>>>>>') #%% Resolving ambiguous muts # Merging ambiguous muts #================= @@ -1013,8 +1012,7 @@ else: gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1['mutation_info_orig'].value_counts() -#gene_LF1['mutation_info_v1'].value_counts() -foo = gene_LF1.iloc[ambig_muts_rev_df.index] +gene_LF1['mutation_info_v1'].value_counts() # Sanity check1: if there are still any ambiguous muts #muts_split_rev = list(gene_LF1.groupby('mutation_info_v1')) @@ -1527,8 +1525,8 @@ if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['d else: print('\nFAIL: revised dst mode numbers MISmatch') -foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] -foo2 = foo.sort_values(['position', 'Mut']) +#foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] +#foo2 = foo.sort_values(['position', 'Mut']) print('\n------------------------------------------------------' , '\nRevised counting: mutation_info i.e dm om column\n' @@ -1727,7 +1725,7 @@ len(gene_LF3) # Dropping duplicates from lineage df lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()] lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()] -if len(lf_lin_split_U) == len(snps_only): +if len(lf_lin_split_U) == len(SnpFDict): print('\nPASS: Duplicate entries removed from lf_lin' , '\nReady to start the final merge to generate gene_LF4') else: