Functions for Epidemiological Analysis using Population Data

Enables computation of epidemiological statistics where e.g. counts or mortality rates of the reference population are used. Currently supported: excess hazard models, rates, mean survival times, relative survival, as well as standardized incidence and mortality ratios (SIRs/SMRs), all of which can be easily adjusted for e.g. age. Fast splitting and aggregation of 'Lexis' objects (from package 'Epi') and other computations achieved using 'data.table'.


The purpose of popEpi is to facilitate computing certain epidemiological statistics where population data is used. Current main attractions:

the lexpand function allows users to split their subject-level follow-up data into sub-intervals along age, follow-up time and calendar time, merge corresponding population hazard information to those intervals, and to aggregate the resulting data if needed.

data(sire)
sr <- sire[1,]
print(sr)
#> 1:   1 1952-05-27 1994-02-03 2012-12-31      0 41.68877
x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date,
             status = status %in% 1:2, 
             fot = 0:5, per = 1994:2000)
print(x)
#>     lex.id      fot     per      age    lex.dur lex.Cst lex.Xst sex
#>  1:      1 0.000000 1994.09 41.68877 0.90958904       0       0   1
#>  2:      1 0.909589 1995.00 42.59836 0.09041096       0       0   1
#>  3:      1 1.000000 1995.09 42.68877 0.90958904       0       0   1
#>  4:      1 1.909589 1996.00 43.59836 0.09041096       0       0   1
#>  5:      1 2.000000 1996.09 43.68877 0.90958904       0       0   1
#>  6:      1 2.909589 1997.00 44.59836 0.09041096       0       0   1
#>  7:      1 3.000000 1997.09 44.68877 0.90958904       0       0   1
#>  8:      1 3.909589 1998.00 45.59836 0.09041096       0       0   1
#>  9:      1 4.000000 1998.09 45.68877 0.90958904       0       0   1
#> 10:      1 4.909589 1999.00 46.59836 0.09041096       0       0   1
#>        bi_date    dg_date    ex_date status   dg_age
#>  1: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  2: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  3: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  4: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  5: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  6: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  7: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  8: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#>  9: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
#> 10: 1952-05-27 1994-02-03 2012-12-31      0 41.68877
data(popmort)
x <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date,
             status = status %in% 1:2, 
             fot = 0:5, per = 1994:2000, pophaz = popmort)
print(x)
#>     lex.id      fot     per      age    lex.dur lex.Cst lex.Xst sex
#>  1:      1 0.000000 1994.09 41.68877 0.90958904       0       0   1
#>  2:      1 0.909589 1995.00 42.59836 0.09041096       0       0   1
#>  3:      1 1.000000 1995.09 42.68877 0.90958904       0       0   1
#>  4:      1 1.909589 1996.00 43.59836 0.09041096       0       0   1
#>  5:      1 2.000000 1996.09 43.68877 0.90958904       0       0   1
#>  6:      1 2.909589 1997.00 44.59836 0.09041096       0       0   1
#>  7:      1 3.000000 1997.09 44.68877 0.90958904       0       0   1
#>  8:      1 3.909589 1998.00 45.59836 0.09041096       0       0   1
#>  9:      1 4.000000 1998.09 45.68877 0.90958904       0       0   1
#> 10:      1 4.909589 1999.00 46.59836 0.09041096       0       0   1
#>        bi_date    dg_date    ex_date status   dg_age     pop.haz       pp
#>  1: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001170685 1.000651
#>  2: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001441038 1.000651
#>  3: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001200721 1.001856
#>  4: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001300846 1.001856
#>  5: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001400981 1.003207
#>  6: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.002142293 1.003207
#>  7: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.002202424 1.005067
#>  8: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.001771568 1.005067
#>  9: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.002222468 1.007277
#> 10: 1952-05-27 1994-02-03 2012-12-31      0 41.68877 0.002282603 1.007277
a <- lexpand(sr, birth = bi_date, entry = dg_date, exit = ex_date,
             status = status %in% 1:2,
             fot = 0:5, per = 1994:2000, aggre = list(fot, per))
print(a)
#>     fot  per       pyrs at.risk from0to0
#>  1:   0 1994 0.90958904       0        0
#>  2:   0 1995 0.09041096       1        0
#>  3:   1 1995 0.90958904       0        0
#>  4:   1 1996 0.09041096       1        0
#>  5:   2 1996 0.90958904       0        0
#>  6:   2 1997 0.09041096       1        0
#>  7:   3 1997 0.90958904       0        0
#>  8:   3 1998 0.09041096       1        0
#>  9:   4 1998 0.90958904       0        0
#> 10:   4 1999 0.09041096       1        1

One can make use of the sir function to estimate indirectly standardised incidence or mortality ratios (SIRs/SMRs). The data can be aggregated by lexpand or by other means. While sir is simple and flexible in itself, one may also use sirspline to fit spline functions for the effect of e.g. age as a continuous variable on SIRs.

data(popmort)
data(sire)
c <- lexpand( sire, status = status %in% 1:2, birth = bi_date, exit = ex_date, entry = dg_date,
              breaks = list(per = 1950:2013, age = 1:100, fot = c(0,10,20,Inf)), 
              aggre = list(fot, agegroup = age, year = per, sex) )
#> dropped 16 rows where entry == exit
 
se <- sir( coh.data = c, coh.obs = 'from0to1', coh.pyrs = 'pyrs', 
           ref.data = popmort, ref.rate = 'haz', 
           adjust = c('agegroup', 'year', 'sex'), print = 'fot')
#> Confidence intervals calculated from profile-likelihood.
se
#> SIR Standardized by:  agegroup year sex
#> 
#>  Total observed: 4559 
#>  Total expected: 1482.13 
#>  Total person-years: 
#> 39905.92 
#> 
#> Poisson modelled SIR: 
#>    fot observed expected     pyrs  sir 2.5 % 97.5 % p_value
#> 1:   0     4264  1214.54 34445.96 3.51  3.41   3.62   0.000
#> 2:  10      295   267.59  5459.96 1.10  0.98   1.23   0.094
#> 
#> Test for homogeneity p < 0.001

The survtab function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for Lexis data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed.

One can also use survtab_ag for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates.

library(Epi)
#> 
#> Attaching package: 'Epi'
#> The following object is masked from 'package:base':
#> 
#>     merge.data.frame
 
data(sibr)
sire$cancer <- "rectal"
sibr$cancer <- "breast"
sr <- rbind(sire, sibr)
 
sr$cancer <- factor(sr$cancer)
sr <- sr[sr$dg_date < sr$ex_date, ]
 
sr$status <- factor(sr$status, levels = 0:2, 
                    labels = c("alive", "canD", "othD"))
 
x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), 
           exit = list(CAL = get.yrs(ex_date)), 
           data = sr,
           exit.status = status)
#> NOTE: entry.status has been set to "alive" for all.
 
st <- survtab(FUT ~ cancer, data = x,
              breaks = list(FUT = seq(0, 5, 1/12)),
              surv.type = "cif.obs")
st
#> 
#> Call: 
#>  survtab(formula = FUT ~ cancer, data = x, breaks = list(FUT = seq(0, 5, 1/12)), surv.type = "cif.obs") 
#> 
#> Type arguments: 
#>  surv.type: cif.obs --- surv.method: hazard
#>  
#> Confidence interval arguments: 
#>  level: 95 % --- transformation: log-log
#>  
#> Totals:
#>  person-time:62120 --- events: 5375
#>  
#> Stratified by: 'cancer'
#>    cancer Tstop surv.obs.lo surv.obs surv.obs.hi SE.surv.obs CIF_canD
#> 1: breast   2.5      0.8804   0.8870      0.8933    0.003290   0.0687
#> 2: breast   5.0      0.7899   0.7986      0.8070    0.004368   0.1162
#> 3: rectal   2.5      0.6250   0.6359      0.6465    0.005480   0.2981
#> 4: rectal   5.0      0.5032   0.5148      0.5263    0.005901   0.3727
#>    CIF_othD
#> 1:   0.0442
#> 2:   0.0852
#> 3:   0.0660
#> 4:   0.1125

News

Changes in 0.4.1

  • lexpand() bug fixed (#120): observations were dropped if their entry by age was smaller than the smallest age value, though entry at exit is correct and used now.
  • sir() rewrite (#118, #122). New more consistent output, updates on plotting and minor changes in arguments. Introduce very simple coef() and confint() methods for sir class.
  • new functions in sir family: sir_ag(), sir_lex() and sir_exp() for extracting SMRs from aggre and Lexis objects.
  • fixed issue in internal test brought by pkg survival version 2.39.5; no changes in functions were needed (#125)
  • robustified aggre(); there were issues with Epi pkg dev version which are definitely avoided (#119)

Changes in 0.4.0

  • removed previously deprecated shift.var (#35)
  • popEpi no longer depends on package data.table but imports it - this means the user will have to do library(data.table) separately to make data.table's functions become usable. Formerly popEpi effectively did library(data.table) when you did library(popEpi).
  • summary.survtab: args t and q behaviour changed
  • survtab: internal weights now based on counts of subjects in follow-up at the start of follow-up (used to be sum of counts/pyrs over all of follow-up)
  • new functions: rate_ratio(), sir_ratio()
  • small internal changes in preparation for data.table 1.9.8

Changes in 0.3.1

This is a hotfix. survtab() was causing warnings in certain situations, which this update fixes. Also fixed plotting survtab objects so that multiple strata are plotted correctly when one or more curves end before the longest one as well other small fixes: See Github issues #89, #90, #91, and #92.

Changes in 0.3.0

Direct adjusting (computing weighted averages of estimates) has been generalized. Functions such as survtab and survmean allow for using adjust() mini function within formulas, or a separate adjust argument. Weights are passed separately. See the examples in the next chapter. See also ?direct_adjusting.

The survtab function computes observed, net/relative and cause-specific survivals as well as cumulative incidence functions for Lexis data. Any of the supported survival time functions can be easily adjusted by any number of categorical variables if needed.

One can also use survtab_ag for aggregated data. This means the data does not have to be on the subject-level to compute survival time function estimates.

## prep data
data(sibr)
sire$cancer <- "rectal"
sibr$cancer <- "breast"
sr <- rbind(sire, sibr)
 
sr$cancer <- factor(sr$cancer)
sr <- sr[sr$dg_date < sr$ex_date, ]
 
sr$status <- factor(sr$status, levels = 0:2, 
                    labels = c("alive", "canD", "othD"))
 
## create Lexis object
library(Epi)
#> 
#> Attaching package: 'Epi'
#> The following object is masked from 'package:popEpi':
#> 
#>     [.Lexis
#> The following object is masked from 'package:base':
#> 
#>     merge.data.frame
x <- Lexis(entry = list(FUT = 0, AGE = dg_age, CAL = get.yrs(dg_date)), 
           exit = list(CAL = get.yrs(ex_date)), 
           data = sr,
           exit.status = status)
#> NOTE: entry.status has been set to "alive" for all.
 
## population hazards file - see ?pophaz for general instructions
data(popmort)
pm <- data.frame(popmort)
names(pm) <- c("sex", "CAL", "AGE", "haz")
 
## simple usage - uses lex.Xst as status variable
st <- survtab(FUT ~ cancer, data = x,
              breaks = list(FUT = seq(0, 5, 1/12)),
              surv.type = "surv.rel", pophaz = pm)
 
## more explicit usage
library(survival)
st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer, data = x,
              breaks = list(FUT = seq(0, 5, 1/12)),
              surv.type = "surv.rel", pophaz = pm)
 
 
## adjusting
x$agegr <- cut(x$dg_age, c(0,55,65,75,Inf))
w <- as.numeric(table(x$agegr))
st <- survtab(Surv(FUT, event = lex.Xst) ~ cancer + adjust(agegr), 
              data = x,
              breaks = list(FUT = seq(0, 5, 1/12)),
              surv.type = "surv.rel", 
              pophaz = pm, weights = w)

The new rate function enables easy calculation of e.g. standardized incidence rates:

## dummy data
 
a <- merge(0:1, 1:18)
names(a) <- c("sex", "agegroup")
set.seed(1)
a$obs <- rbinom(nrow(a), 100, 0.5)
set.seed(1)
a$pyrs <- rbinom(nrow(a), 1e4, 0.75)
 
## so called "world" standard rates (weighted to hypothetical world pop in 2000)
r <- rate(data = a, obs = obs, pyrs = pyrs, print = sex, 
          adjust = agegroup, weights = 'world_2000_18of5')
#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow
 
#> Warning in pyrDSMNspOBEl * pyrDSMNspOBEl: NAs produced by integer overflow
sexobspyrsrate.adjSE.rate.adjrate.adj.lorate.adj.hirateSE.raterate.lorate.hi
09331349860.00699470.00025410.00651400.00751080.0069118NA0.00648220.0073699
18751348490.00644530.00024290.00598650.00693940.0064887NA0.00607270.0069332

Reference manual

It appears you don't have a PDF plugin for this browser. You can click here to download the reference manual.

install.packages("popEpi")

0.4.1 by Joonas Miettinen, 3 months ago


https://github.com/WetRobot/popEpi


Report a bug at https://github.com/WetRobot/popEpi/issues


Browse source code at https://github.com/cran/popEpi


Authors: Joonas Miettinen [aut, cre], Matti Rantanen [aut], Karri Seppa [ctb]


Documentation:   PDF Manual  


Task views: Survival Analysis


GPL-3 license


Imports Epi, survival, data.table

Suggests reshape2, methods, testthat, knitr, rmarkdown, relsurv, ggplot2, mstate, date, splines, roxygen2


See at CRAN