A Set up

Getting data into correct format

Rachael Burke
2022-07-21

This document takes data and merges / sets it up the way we need it for the analysis. Most of the data is already on github at Dr MacPherson’s repo and package “mlwdata” (XXXXX).

We load most data from mlwdata, but there is an extra spreadsheet with some additional lab data that we read in from “data_sharing” folder in this (tbacf) repo.

Note that the end of this code puts .rds files in a “data” folder. “Data” is not included in this repo (but can be recreated locally through running this markdown file and using data from “mlwdata” and “extra_data” which is in this repo). Note that “here” in my version is actually heading one file division up (into the “main” repo and not “docs”, where this .Rmd is saved). You may need to fix the relative file paths to the “extra data” spreadsheet to make this work for you.

Load libraries, read in data and define some timepoints / labels

# Important libraries
library(tidyverse)
# library(devtools) # needed if you don't already have 'mlwdata' installed
# install.github("petermacp/mlwdata")
library(mlwdata)
library(lubridate)

# libraries needed in this code / to control aesthetics / preference that you could do without
library(formatR) # to wrap text in this markdown
library(here)

acf_start_date <- dmy("01 April 2011") 
acf_end_date <- dmy("01 Oct 2014")

diag_fct <- c("a) Clinically dx", "b) Smr/cult TB lab", "c) Xpert clinic", "d) Smr clinic", "e) Direct ACF") # factor levels for how TB was diagnosed
age_levels <- c("0-14","15-24","25-34","35-44","45-54","55-64","65+")

# This is the main data used in this analysis
cases_g <- mlwdata::blantyre_tb_cases_2009_2018 # this is grouped data, very sparse.  Contains pre-ACF data (2009-2010)
cases_l <- mlwdata::tb_cases_2011_2018 # this is long data from post ACF starting (2011-); this has more detail on each person with TB

# this is a dataset that contains CNR quarterly; but also contains population denominator 
# for ACF and non-ACF areas for adults 15+.  
# See documentation on PM's github, population denominators come from intrapolation from GoM census' (which were in 2008 and 2018).
pop <- mlwdata::acf_cnrs_overall %>%  
  mutate(yq=yq(year_q)) %>% 
  mutate(acfarea=case_when(acf=="ACF" ~ "b) ACF",
                           acf=="Non-ACF" ~ "a) Non-ACF")) %>%
  group_by(yq) %>%
  mutate(yq_num=cur_group_id()-1) %>% # this is the number of quarters elapsed since Q1.2009
  dplyr::select(yq, yq_num, acfarea, population)

# This is some additional data from TB lab and x01 form
extra_info <- readRDS(here("data_sharing","extra_info.rds")) %>% select(unique_id, l30cultures, l32id, new_mode)

Define some functions

# This is function to add in population denominators, group cases and recreate useful columns (will work on whole long dataset, or subsections of it)
add_denoms <- function(df){
  df %>%
  group_by(yq) %>%
    mutate(yq_num=cur_group_id()-1) %>% # yq num this is number of quarter years passed since January 2009 and is used for statistical modelling.
    left_join(pop, by=c("yq", "yq_num", "acfarea")) %>%
    mutate(cnr= (n/population) * 100000 * 4) %>%
    mutate(yq_mid = yq + days(45)) %>%
    mutate(acftime=case_when(yq_mid>acf_start_date & yq_mid<acf_end_date ~ "acf",
                           yq_mid<acf_start_date ~ "pre-acf",
                           yq_mid>acf_end_date ~ "post-acf")) %>%
    mutate(acftime=factor(acftime, levels=c("pre-acf", "acf", "post-acf")))
}

Wrangle the 2011-2018 data

Including adding in the “extra data” form lab that wasn’t in PM’s spreadsheet. Prospectively collected data.

# This adds the lab information (and the people who were diagnosed directly through ACF vs. indirectly)
tb_2011_2018 <- cases_l %>% 
  mutate(yq=yq(year_q)) %>%
  left_join(extra_info) %>% # the people not from Blantyre City *don't* get merged here as "left join" rather than full join.
  mutate(cult_lab=case_when(
  (l30cultures=="Pos" | l30cultures=="Scanty") & l32id=="MTB" ~ "Culture confirmed")) %>% 
  mutate(diagnosed=case_when(
    new_mode==3 ~ "e) Direct ACF",
    smr_clinic=="b) Smear-positive" ~ "d) Smr clinic",
    xpert_clinic=="b) Xpert-positive" ~ "c) Xpert clinic",
    cult_lab=="Culture confirmed" ~ "b) Smr/cult TB lab",
    TRUE ~ "a) Clinically dx")) %>% 
  mutate(facility = case_when(
    fac_code %in% c("BTQE") ~ "a) Central hospital",
    fac_code  %in% c("BTBG","BTCH","BTLB","BTND","BTSL","BTZG","BTGW")  ~ "b) Health centre",
    fac_code %in% c("BTBAH", "BTCH", "BTMB", "BTMW") ~ "c) Private health facility",
    TRUE ~ "d) Not recorded")) %>%
  mutate(hiv=case_when(is.na(hiv) ~ "c) Not recorded", TRUE ~ hiv)) %>%
  mutate(art=case_when(hiv=="b) HIV-positive" & is.na(art)==F ~ art,  
                       hiv=="b) HIV-positive" & is.na(art) ~ "c) Not recorded", # why does no-one have a missing ART status?
                       TRUE ~ NA_character_)) %>%
  mutate(ageg = cut(age, breaks=c(0,15,25,35,45,55,65,Inf), right=F, labels=age_levels)) %>%
  filter(ageg!="0-14") %>% # this is now just adults who reside in Blantyre City  (removed 1272 children from the dataset for analysis (see methods of paper for rationale - mainly that children not targetted by ACF at all))
  mutate(acfarea=case_when(acf=="ACF" ~ "b) ACF",
                           acf=="Non-ACF" ~ "a) Non-ACF")) 

Wrangle the 2009-2010 data

Retrospectively collected (grouped) data. Make this into a “long” dataset, using “uncount”.

# Use "uncount" to change grouped data into a one-observation-per-row data (have to create "intermediate" dfs for this, as I don't think I can "uncount" smr-neg and smr-pos seperately in one step)
int_a <- cases_g %>% mutate(yq=yq(year_q)) %>% filter(yq<=dmy("01 Jan 2011")) %>% pivot_wider(names_from=tbcases, values_from=n) %>% rename(micro_pos=`Smr/Xpert-positive cases`) %>% mutate(micro_neg=`All cases` - micro_pos)
int_b <- int_a %>% dplyr::select(yq, acf, micro_pos) %>% uncount(micro_pos) %>% mutate(smr="smr pos")
int_c <- int_a %>% dplyr::select(yq, acf, micro_neg) %>% uncount(micro_neg) %>% mutate(smr="smr neg / not done")

tb_2009_2010 <- int_b %>% full_join(int_c)  %>% 
  mutate(diagnosed=case_when(
    smr=="smr pos" ~ "d) Smr clinic", # there was no Xpert and no TB lab culture during 2009-2010
    TRUE ~ "a) Clinically dx"
  )) %>% mutate(diagnosed=factor(diagnosed, levels=diag_fct)) %>%
      mutate(acfarea=case_when(acf=="ACF" ~ "b) ACF",
                           acf=="Non-ACF" ~ "a) Non-ACF"))

Merge timeperiods and create “grouped” data for each quarter

Merge 2009-2010 to 2011 onwards, and label time periods.

tb_2009_2018 <- tb_2009_2010 %>% 
  full_join(tb_2011_2018) %>% 
  dplyr::select(yq, acfarea, diagnosed, acf) %>% 
  mutate(yq_mid = yq + days(45)) %>%
  mutate(acftime=case_when(yq_mid>acf_start_date & yq_mid<acf_end_date ~ "acf",
                           yq_mid<acf_start_date ~ "pre-acf",
                           yq_mid>acf_end_date ~ "post-acf")) %>%
  mutate(acftime=factor(acftime, levels=c("pre-acf", "acf", "post-acf")))

# check number of records correct
nrow(tb_2009_2010) + nrow(tb_2011_2018)
[1] 20119
nrow(tb_2009_2018) # same
[1] 20119

Then created “grouped” datasets, that contain the “case notification rates” (CNRs) rather than just number of cases. This is adult TB diagnoses per 100,000 adult population, by ACF area (“add_denoms” function specifed above under section “Start”).

# This is cnrs for each of four levels of how diagnosed
cnrs_d <- tb_2009_2018 %>% group_by(yq, acfarea, diagnosed) %>% summarise(n=n()) %>% 
  complete(.,yq, acfarea, diagnosed, fill = list(n=0)) %>% # levels that have no people, should exist in data, with "0" people
  ungroup() %>%
  add_denoms()

# This is all form
cnrs_all <- tb_2009_2018 %>% group_by(yq, acfarea) %>% summarise(n=n()) %>% 
  ungroup() %>%
  add_denoms()

# This is smear pos just from clinic (i.e excluding those subsequently micro confirmed in TB lab)
cnrs_smp_clinic <- tb_2009_2018 %>% filter(diagnosed=="c) Xpert clinic" | diagnosed=="d) Smr clinic" | diagnosed=="e) Direct ACF") %>% 
  group_by(yq, acfarea) %>% 
  summarise(n=n()) %>% 
  ungroup() %>%
  add_denoms()

Save these datasets (with a reminder of what they are).

saveRDS(tb_2009_2018, file=here("data","tb_2009_2018.rds")) # Each row represents one adult in Blantyre City diagnosed with TB, information on method of TB diagnosis, quarter-year of diagnosis and whether they lived in an ACF or non-ACF area.
saveRDS(tb_2011_2018, file=here("data","tb_2010_2018.rds")) # Each row represents one adult in Blantyre City diagnosed with TB, but with more information per person than the 2009_2010 file (i.e. includes HIV status, age and sex)
saveRDS(tb_2009_2010, file=here("data","tb_2009_2010.rds"))
saveRDS(pop, file=here("data","pop.rds")) # Blantyre City adult population by quarter-year and ACF vs. non-ACF area

saveRDS(cnrs_d, file=here("data","cnrs_d.rds")) # TB CNRs (adult TB diagnoses per 100,000 adults) by method of diagnosis
saveRDS(cnrs_all, file=here("data","cnrs_all.rds")) # All form TB CNRs
saveRDS(cnrs_smp_clinic, file=here("data","cnrs_smp_clinic.rds")) # CNR for smr positive / micro confirmed TB (at the time of starting TB treatment)