updating lineage_country.R with different data slices

This commit is contained in:
Tanushree Tunstall 2020-09-15 13:14:33 +01:00
parent 645827570f
commit 44d1f64e88

View file

@ -23,7 +23,7 @@ source("combining_dfs_plotting.R")
# Data for plot # Data for plot
######################### #########################
df = merged_df2 df = merged_df2
df = merged_df2_comp #df = merged_df2_comp
#======================== #========================
@ -32,10 +32,6 @@ df = merged_df2_comp
# col = Lineage # col = Lineage
# fill = lineage # fill = lineage
#======================== #========================
# Coerce the lineage column to a factor, printing whether it is a factor
# before and after the conversion.
is.factor(df[["lineage"]])
df[["lineage"]] = as.factor(df[["lineage"]])
is.factor(df[["lineage"]])
table(df$lineage) table(df$lineage)
# subset only lineages1-4 # subset only lineages1-4
@ -57,6 +53,35 @@ table(df_lin$lineage)
df <- df_lin df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%% #%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# Work on a copy so that df is only replaced once the filtered result exists.
df2 = df
#%%%%%%%%%%%%%%%%%%%%%%%%

# Append to every row the number of rows sharing its country_code
# (add_count() stores the tally in a new column called "n").
df2 = df2 %>%
  add_count(country_code)

str(df2$country_code); str(df2$n)

# Rename the tally column from "n" to "count_country"
n = which(colnames(df2) == "n")
colnames(df2)[n] = "count_country"

# Preview how many rows pass the filter. Both conditions are read from df2,
# the same data frame that is subset below, so the preview stays valid even
# if this line is re-run after df has been reassigned further down.
table(df2$count_country > 100 & df2$country_code != "")

# Keep rows whose country_code is non-empty and occurs in more than 100 rows
df3 = subset(df2, df2$count_country > 100 & df2$country_code != "")

#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# From here on df is the country-filtered data
df = df3
#%%%%%%%%%%%%%%%%%%%%%%%%

# Number of distinct (non-NA) values in df$id
sample = sum(table(unique(df$id))); sample

# Rows per country_code, and their total
table(df$country_code)
tab = sum(table(df$country_code)); tab

# Interactive inspection of the per-country table
View(table(df$country_code))
# NOTE(review): t1 is not defined in the visible part of this script;
# confirm it exists before this call, otherwise View() will error.
View(t1)
############## begin plot ############## begin plot
g = ggplot(df, aes(x = lineage)) g = ggplot(df, aes(x = lineage))
g + geom_bar(aes(fill = lineage)) + g + geom_bar(aes(fill = lineage)) +
@ -112,7 +137,7 @@ g + geom_point(aes(col = lineage
### begin plot ### begin plot
table(df$lineage) table(df$lineage)
g = ggplot(df, aes(x = lineage g = ggplot(df_lin, aes(x = lineage
, y = duet_scaled)) , y = duet_scaled))
g + geom_point(aes(col = lineage g + geom_point(aes(col = lineage
, size = or_mychisq)) + , size = or_mychisq)) +
@ -131,60 +156,10 @@ g + geom_point(aes(col = lineage
, y = "DUET (PS)") , y = "DUET (PS)")
#======================== #========================
# Plot 4-6: Distributions # Plot 4-5: Distributions
# ggridges # ggridges
#======================== #========================
# Character copy of country_code, used for the sanity checks below
df$country_code_symbols = as.character(df$country_code)

foo = df

# Append the number of rows per country_code (add_count() names it "n"),
# then rename that column to "count_country".
foo = foo %>%
  add_count(country_code)

n = which(colnames(foo) == "n")
colnames(foo)[n] = "count_country"
table(foo$count_country)

# Two-column check table: V1 = country code, V2 = rows for that country.
# Built with data.frame() rather than as.data.frame(cbind(...)): cbind()
# would coerce both columns to a character matrix, and where strings become
# factors by default (R < 4.0) as.numeric() would then return factor level
# codes instead of the real counts.
check = data.frame(V1 = foo$country_code_symbols
                   , V2 = foo$count_country
                   , stringsAsFactors = FALSE)
str(check)
check$V2 = as.numeric(check$V2)
min(check$V2); max(check$V2)
table(check$V2)

# Countries with more than 100 rows ...
check2 = subset(check, check$V2 > 100)
# ... dropping rows with a missing country code. The original kept only the
# NA rows here (is.na without negation), which made the non-empty check
# below always return zero rows.
check2 = subset(check2, !is.na(check2$V1))
min(check2$V2); max(check2$V2)

# ... and dropping rows with an empty country code
check3 = subset(check2, check2$V1 != "")
table(check3$V1 != "")

# subset df with country containing >100 samples
# and no missing
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df2 = df
#%%%%%%%%%%%%%%%%%%%%%%%%
df2 = df2 %>%
  add_count(country_code)

str(df2$country_code); str(df2$n)

# Rename the tally column from "n" to "count_country"
n = which(colnames(df2) == "n")
colnames(df2)[n] = "count_country"

# Preview the filter using df2 for both conditions, matching the subset()
# call below (the original mixed df2 and df here).
table(df2$count_country > 100 & df2$country_code != "")

df3 = subset(df2, df2$count_country > 100 & df2$country_code != "")

#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# From here on df is the country-filtered data
df = df3
#%%%%%%%%%%%%%%%%%%%%%%%%
#================================================== #==================================================
my_ats = 15 # axis text size my_ats = 15 # axis text size