63  Scatter Plot

Code
dat <- foreign::read.dta("C:/Dataset/bea_organ_damage_28122013.dta")

dataF <- readstata13::read.dta13("C:/Dataset/olivia_data_wide.dta")

df1 <- read.csv("C:\\Users\\Sbngu\\Dropbox\\Data for book\\booking1.csv")

SC <- 
  dat %>% 
  select(q12weight, q13waist, q3sex, q10what) %>% 
  drop_na() %>%
  ggplot(aes(x = q12weight, y = q13waist, color = q3sex))
Code
SC + 
    geom_point(shape = "diamond", size = 2) +
    labs(
        x = "Weight (kgs)", 
        color = "Gender",
        y = "Waist Circumference (cms)") +
    geom_smooth(method = "lm", formula = "y ~ x") +
    theme(
        axis.title.x = element_text(
            vjust = 0, 
            size = 14, 
            color = "blue", 
            face = "italic"),
        axis.title.y = element_text(
          vjust = 2, 
          size = 14, 
          color = "firebrick", 
          face = "bold"),
        axis.text = element_text(
          color = "dodgerblue", size = 12),
        axis.text.x = element_text(face = "italic"))
Figure 63.1: Relationship between weight and waist circunference
Code
SC +    
  geom_point(aes(shape = q3sex)) + 
    geom_smooth(method=lm, formula = y~x, se=F) + 
    theme_bw(base_family = "serif") +
    labs(
        x = "Weight (kgs)", 
        y = "Waist Circumference (cms)") + 
    theme(
        plot.title=element_text(size=15, face="bold"), 
        axis.text.x=element_text(size=12), 
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=13),
        axis.title.y=element_text(size=13), 
        plot.background = element_rect(fill = "grey90"),
        panel.background = element_rect(fill = "snow1")) +
    scale_color_discrete(name="Sex") +
    scale_shape_discrete(name="Sex")
Figure 63.2: Relationship between weight and waist circunference
Code
SC +    
    geom_point() + 
    geom_smooth(method=loess, se=TRUE, formula = y~x) + 
    theme_classic() +
    labs(
        x = "Weight (kgs)", 
        y = "Waist Circumference (cms)") + 
    theme(
        plot.title=element_text(size=15, face="bold"), 
        axis.text.x=element_text(size=12), 
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=13),
        axis.title.y=element_text(size=13)) + 
  scale_color_discrete(name="Sex")
Figure 63.3: Relationship between weight and waist circunference
Code
SC +    
  geom_point() + 
    geom_smooth(method=lm, formula = y ~ x, se=F) + 
    theme_minimal() +
    labs(
        x = "Weight (kgs)", 
        y = "Waist Circumference (cms)") + 
    theme(
        plot.title=element_text(size=15, face="bold"), 
        axis.text.x=element_text(size=12), 
        axis.text.y=element_text(size=12),
        axis.title.x=element_text(size=13),
        axis.title.y=element_text(size=13)) + 
    scale_color_discrete(name = "Sex") + 
    facet_wrap(~q10what)
Figure 63.4: Relationship between weight and waist circunference
Code
SC + 
  geom_point(color = "firebrick") +
  labs(
      x = "Weight (kgs)", 
      y = "Waist Circumference (cms)") +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
Figure 63.5: Relationship between weight and waist circunference

Limit Axes Range

Code
df1 %>% 
    drop_na(weight, height) %>% 
  ggplot(aes(x = weight, y = height)) +
    geom_point(aes(color = sex, shape = sex)) + 
    geom_smooth(method = "lm", formula = y ~ x, se = T) + 
    labs(x = "Weight (kgs)", y = "Height (cm)") +
    theme_light() +
    theme(
        plot.title=element_text(size = 16, face = "bold"), 
        axis.text.x=element_text(size = 12), 
        axis.text.y=element_text(size = 12),
        axis.title.x=element_text(size = 12, face = "bold"),
        axis.title.y=element_text(size = 12, face = "bold")) + 
    scale_color_discrete(name="Gender") + 
    scale_shape_discrete(name="Gender")
Figure 63.6: Plot of Weight versus height
Code
dataF %>% 
    mutate(
        mcv_cat_1 = case_when(
            mcv1 < 80 ~ "Microcyte",
            mcv1 >= 80 & mcv1 <= 90 ~ "Normal",
            mcv1 > 90 ~ "Macrocyte") %>% 
            factor(levels = c("Microcyte", "Normal", "Macrocyte"))) %>% 
    ggplot(aes(x = hb1, y = hb2)) +
    geom_point(aes(size = mcv_cat_1, col = mcv_cat_1), alpha = .5) +
    geom_smooth(formula = y ~ x, method = "lm", color = "black") +
    geom_vline(
        xintercept = 10, 
        color = "red", 
        linewidth = 0.5, 
        linetype = "dashed") +
    geom_hline(
        yintercept = 10, 
        color = "red", 
        linewidth = 0.5, 
        linetype = "dashed") +
    geom_abline(
        intercept = 0, 
        slope = 1, 
        color = "brown", 
        linewidth = 0.5, 
        linetype = "dashed") +
    labs(x = "First Hemoglobin", y = "Second Hemoglobin") +
    theme_light()
Figure 63.7: Relationship between first and second platelet counts showing possible outliers
Code
dataF.2 <- 
    tibble(
    lbls = c("Kofi","Ama", "Yaw","Sammy", "Abena"),
    hgts = c(176, 154, 136, 144, 165),
    wgts = c(65, 76,48,77, 65))

dataF.2 %>% 
    ggplot(aes(x = hgts, y = wgts))+
    geom_point() +
    annotate(
        "text", 
        x = dataF.2$hgts, 
        y = dataF.2$wgts, 
        label = dataF.2$lbls, 
        vjust = 1, col=1:5) +
    labs(
        title = "Height vrs Weight", 
        subtitle = "Yes we can", 
        caption = "2020 Data")+
    ggthemes::theme_gdocs() +
    scale_y_continuous(
        name = "Weight (kgs)", 
        limits = c(40, 80), 
        breaks = c(45, 50, 55, 60, 65, 70, 75)) +
    scale_x_continuous(
        name = "Height (kgs)", 
        limits = c(130, 200), 
        breaks = seq(130, 190, 10)) +
    geom_label(aes(label = lbls), col = "grey", nudge_y = 3)
Figure 63.8: Height v. Weight
Code
dataG <- 
    dataF %>% 
    mutate(is_outlier = (plt1<50 | plt2<100 | plt2>400 | plt1>400)) 

dataG %>% 
    ggplot(aes(x = plt1, y = plt2, col = is_outlier)) + 
    geom_point() +
    labs(
        x = "First Platelet Count",
        y = "Second Platelet Count",
        title = "Relationship between first and second platelet counts showing possible outliers")+
    theme_bw()+
    ggrepel::geom_label_repel(
        data = filter(dataG, is_outlier == TRUE), 
        aes(label=id)) +
    theme(legend.position="none")
Figure 63.9: Relationship between first and second platelet counts showing possible outliers
Code
p1 <-
    dataF %>% 
    ggplot(aes(x = hb1, y = hb2))+
    geom_point(col = "maroon") +
    geom_smooth(formula = y~x, method = "lm") +
    theme_bw() +
    labs(
        x = "First HgB measurement (g/dL)",
        y = "Second HgB measurement (g/dL)")

p2 <- 
    dataF %>% 
    ggplot() +
    geom_histogram(aes(x=hb1),bins = 12, col="black", fill = "grey") +
    labs(x=NULL, y=NULL) +
    theme_void() +
    theme(
        axis.ticks.y = element_blank(),
        axis.ticks.x  = element_blank(),
        axis.line.x = element_blank(),
        axis.line.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank())

p3 <- 
    dataF %>% 
    ggplot() +
    geom_histogram(aes(x=hb2),bins = 12, col="black", fill = "grey") +
    coord_flip()+
    labs(x=NULL, y=NULL) +
    theme_void() +
    theme(
        axis.ticks.y = element_blank(),
          axis.ticks.x  = element_blank(),
          axis.line.x = element_blank(),
          axis.line.y = element_blank(),
          axis.text.x = element_blank(),
          axis.text.y = element_blank())

col1 <- (p2/p1) + plot_layout(heights = c(1,4))
col2 <- (plot_spacer()/p3) + plot_layout(heights = c(1,4))
(col1 | col2) +  
    plot_layout(widths = c(5,1)) +
    plot_annotation(caption = "Source: 2021 Data")
Figure 63.10: My special scatterplot with histograms of first and secon HgB
Code
p1 <-
    dataF %>% 
    ggplot(aes(x=hb1, y = hb2, col = fpreg)) +
    geom_point() +
    geom_density_2d(color = "blue")

p2 <-
    dataF %>% 
    ggplot(aes(x=hb1, y = hb3, col = fpreg)) +
    geom_point() +
    geom_density_2d(color = "blue")

(p1 + p2) +
    plot_annotation(
        title = "My special title is here",
        subtitle = "Yes it is here",
        caption = "Why not!",
        theme = theme(
            plot.title = element_text(family ="serif", colour = "red"),
            plot.subtitle = element_text(
                family = "serif", color = "red", face = "italic")),
        tag_levels = "A") + 
    plot_layout(widths = c(1, 2),guides = "collect")
Figure 63.11: Combining plots
Code
dataF %>% 
    mutate(hct3 = ifelse(hct3 < 20, hct3 +40, hct3),
           hct3 = ifelse(hct3 > 60, hct3 - 20, hct3)) %>% 
    ggplot(aes(x = hct3, y = hb3)) +
    geom_point(color = "grey45") +
    geom_smooth(aes(x = hct3, y = hb3, col =  "Observed"), 
                formula = y~x, method = "lm", se = F) + 
    geom_segment(aes(x = min(hct3), y = min(hct3/3), 
                     xend = max(hct3), yend = max(hct3/3), 
                     col =  "Expected"))+
    labs(title = "Relationship between the third HB and HCT measurements",
         subtitle = "Comparison of observed  and expected regression line if HCT = 3*HB",
         x = "Hematocrit (%)", y = "Hemoglobin (mg/dl)", 
         color = "Regression Line") +
    theme_classic()+
    theme(plot.title = element_text(face = "bold"),
          plot.subtitle = element_text(face = "italic"))
Warning in geom_segment(aes(x = min(hct3), y = min(hct3/3), xend = max(hct3), : All aesthetics have length 1, but the data has 350 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

Code
dataF %>% 
    ggplot(size = 0.5) +
    geom_point(aes(hb3, mcv3, color = mcv1, size = occup, shape = educ)) + 
    guides(color = guide_colorbar(title = "First MCV"),
           shape = guide_legend(title = "Educational level"), 
           size = guide_legend(title = "Occupation"))
Warning: Using size for a discrete variable is not advised.

Code
dataF %>%
    mutate(mari = fct_collapse(mari, 
                             "Married" = c("Married","Cohabiting"),
                             "Single" = c("Widowed", "Divorced"))) %>% 
    ggplot()+
    geom_point(aes(x = avehb, y = avehct, color = mari), show.legend = F) +
    geom_smooth(aes(x = avehb, y = avehct), se=F, formula = y~x, 
                method = "lm", size = 1, alpha = .5, col = "grey")+
    facet_wrap(~mari, nrow = 2, strip.position = "left") +
    labs(y = "Hematocrit (%)", x = "Hemoglobin (g/dl)", 
         title = str_glue("Relationship between Blood hemoglobin and ",
                          "Hematocrit stratified by marital status"))+
    theme(panel.background = element_blank(),
          panel.grid = element_blank(),
          axis.line = element_line(),
          strip.placement = "outside",
          strip.background = element_rect(fill = "#c1d3fe", color = "black"),
          strip.text = element_text(size = 10, face = "bold"),
          text = element_text(family = "serif"),
          axis.text = element_text(size = 10, face = "bold"),
          axis.title = element_text(size = 10, face = "bold.italic"),
          plot.title = element_text(face = "bold"))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Code
dataF %>% 
    ggplot(aes(x = mcv1, y = mch1, size = mcv1)) +
    geom_point(shape = 23, aes(fill = agecat), alpha =.2) +
    scale_x_continuous(name = "Initial Mean Corpuscular Volume",
                       breaks = seq(50, 130, 10),
                       labels = c("50.0","60.0","70.0", "80.0", "90.0", "100.0", 
                                  "110.0","120.0", "130.0"), 
                       position = "top") +
    scale_y_continuous(name = "Mean Corpuscular Hemoglobin", 
                       limits = c(15,45),
                       breaks = c(15, 30, 45), 
                       labels = c("15.00","30.00","45.00"),
                       position = "right") +
    scale_fill_manual(name = "Age Category", 
                      values = c("blue", "red", "green", "brown")) +
    scale_size_continuous(name = "MCV",
                          range = c(1,4), 
                          limits = c(50, 140), 
                          breaks = c(80, 120, 140),
                          labels = c("Microcyte", "Normocyte", "Macrocyte"))

Code
agecat_label <- 
    c("Age: 10-19 years", "Age: 20-29 years",
      "Age: 30-39 years","Age: 40-49 years")

names(agecat_label) <- c("10-19", "20-29", "30-39", "50-59")

dataF %>%
    ggplot(aes(hb3, mcv3), size = 0.5) +
    geom_point() +
    geom_smooth(method = "lm", formula = y~x) +
    labs(
        title = "Relationship between hemoglobin and mean corpuscular volume",
        x =  "Hemoglobin (mg/dl)",
        y = "Mean Corpuscular Volume (fl)")+
    theme_bw()+
    facet_wrap(facets = .~agecat, labeller = labeller(agecat = agecat_label))+
    theme(
        text = element_text(family = "serif"),
        strip.text = element_text(face = "bold"),
        strip.background = element_rect(fill = "white"),
        plot.title = element_text(face = 'bold'))

Code
dataF %>% 
    ggplot(aes(hb3, mcv3), size = 0.5) +
    geom_point() +
    geom_smooth(method = "lm", formula = y~x) +
    labs(
        title = "Relationship between hemoglobin and mean corpuscular volume",
        x =  "Hemoglobin (mg/dl)",
        y = "Mean Corpuscular Volume (fl)")+
    theme_bw() +
    facet_grid(occup ~ agecat, labeller = labeller(agecat = agecat_label))
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

Code
dataF %>% 
    ggplot(aes(hb3, mcv3), size = 0.5) +
    geom_point() +
    geom_smooth(method = "lm", formula = y~x) +
    labs(
        title = "Relationship between hemoglobin and mean corpuscular volume",
        x =  "Hemoglobin (mg/dl)",
        y = "Mean Corpuscular Volume (fl)")+
    theme_bw() +
    facet_wrap(c("occup", "agecat"), nrow = 3, labeller = labeller(agecat = agecat_label))
Warning in qt((1 - level)/2, df): NaNs produced
Warning in qt((1 - level)/2, df): NaNs produced
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

Code
dataLM <- dataF %>% select(hct4, hb4)

lm(hb4 ~ hct4, data = dataLM) %>% 
    predict(interval = "predict") %>% 
    as_tibble() %>% 
    bind_cols(dataLM) %>% 
    ggplot(aes(x = hct4, y = hb4)) +
    geom_point() +
    geom_smooth(method = "lm", formula = y~x, se=T)+
    geom_line(aes(y = lwr), col = "coral2", linetype = "dashed") +
    geom_line(aes(y = upr), col = "coral2", linetype = "dashed") +
    labs(title = "Relationship between HB4 and HCT4 with fillted line, prediction and se intervals",
         x = "HCT 4 (%)", y = "HB 4 (mg/dl)", caption = "Nurse Data 2015")+
    theme_bw()
Warning in predict.lm(., interval = "predict"): predictions on current data refer to _future_ responses

Code
dataH <- 
    readxl::read_xlsx(
        "C:/Dataset/Red cell indices against ferritin.xlsx"
        ) %>% 
    mutate(
        lg.fer = log(Ferritin), 
        MCH = ifelse(is.na(MCH), median(MCH, na.rm=T), MCH)
        )

preds <- 
    rbind(
        predict(lm(lg.fer ~ RBC, data = dataH), interval = "prediction"),
        predict(lm(lg.fer ~ HGB, data = dataH), interval = "prediction"), 
        predict(lm(lg.fer ~ HCT, data = dataH), interval = "prediction"),
        predict(lm(lg.fer ~ MCV, data = dataH), interval = "prediction"), 
        predict(lm(lg.fer ~ MCH, data = dataH), interval = "prediction")
        ) %>% 
    as_tibble()
Warning in predict.lm(lm(lg.fer ~ RBC, data = dataH), interval = "prediction"): predictions on current data refer to _future_ responses
Warning in predict.lm(lm(lg.fer ~ HGB, data = dataH), interval = "prediction"): predictions on current data refer to _future_ responses
Warning in predict.lm(lm(lg.fer ~ HCT, data = dataH), interval = "prediction"): predictions on current data refer to _future_ responses
Warning in predict.lm(lm(lg.fer ~ MCV, data = dataH), interval = "prediction"): predictions on current data refer to _future_ responses
Warning in predict.lm(lm(lg.fer ~ MCH, data = dataH), interval = "prediction"): predictions on current data refer to _future_ responses
Code
dataH %>% 
    pivot_longer(cols=RBC:MCH, names_to = "bld.ind") %>% 
    mutate(
        bld.ind = factor(bld.ind, levels = c("RBC", "HGB", "HCT", "MCV", "MCH"))
        ) %>% 
    arrange(bld.ind) %>% 
    bind_cols(preds) %>% 
    ggplot(aes(x = value)) + 
    geom_point(aes(y = lg.fer)) +
    geom_smooth(aes(y = lg.fer), se=T, method = "lm", formula = y~x) +
    geom_line(aes(y = upr), col = "red", linetype = "dashed") +
    geom_line(aes(y = lwr), col = "red", linetype = "dashed") +
    facet_wrap(vars(bld.ind), nrow = 2, scales = "free") +
    labs(
        title = "Blood indices with prediction lines (red), regression line (blue) and regression error",
        y = "Log of serum ferritin concentration",
        x = NULL)