# install.packages("remotes")
remotes::install_github("OHDSI/Capr")
install.packages("CDMConnector")Create OHDSI cohorts with R
Capr provides a language for expressing standard OHDSI Cohort definitions in R code.
Capr is particularly helpful when creating a large number of similar cohorts.
OHDSI defines a cohort as “a set of persons who satisfy one or more inclusion criteria for a duration of time”.
Learn more about the OHDSI approach to cohort building in the cohorts chapter of the Book of OHDSI.
Note that the negation of an “exclusion criteria” is an “inclusion criteria”.
For example: How would we restate these “exclusion criteria” as “inclusion criteria”?
Exclude if there are > 0 occurrences of Cystic Fibrosis prior to index
Include if there are exactly 0 occurrences of Cystic Fibrosis prior to index
We’ll write all our exclusion criteria as inclusion criteria to keep from getting confused.
Install Capr from Github and CDMConnector from CRAN
# install.packages("remotes")
remotes::install_github("OHDSI/Capr")
install.packages("CDMConnector")A fundamental building block to cohorts are concept sets.
These are like code sets but fancier.

Concept sets use OHDSI standard concept IDs (integers) instead of character strings.
library(DatabaseConnector)
library(Capr)
connectionDetails <- Eunomia::getEunomiaConnectionDetails()
connection <- connect(connectionDetails)
querySql(connection, "select * from concept limit 6") CONCEPT_ID CONCEPT_NAME DOMAIN_ID
1 35208414 Gastrointestinal hemorrhage, unspecified Condition
2 1118088 celecoxib 200 MG Oral Capsule [Celebrex] Drug
3 40213201 pneumococcal polysaccharide vaccine, 23 valent Drug
4 1557272 Alendronate Drug
5 4336464 Coronary artery bypass graft Procedure
6 4295880 Intramuscular injection Procedure
VOCABULARY_ID CONCEPT_CLASS_ID STANDARD_CONCEPT CONCEPT_CODE
1 ICD10CM 4-char billing code <NA> K92.2
2 RxNorm Branded Drug S 213469
3 CVX CVX S 33
4 RxNorm Ingredient S 46041
5 SNOMED Procedure S 232717009
6 SNOMED Procedure S 76601001
VALID_START_DATE VALID_END_DATE INVALID_REASON
1 2007-01-01 2099-12-31 <NA>
2 1970-01-01 2099-12-31 <NA>
3 2008-12-01 2099-12-31 <NA>
4 1970-01-01 2099-12-31 <NA>
5 1970-01-01 2099-12-31 <NA>
6 1970-01-01 2099-12-31 <NA>
Gastrointestinal bleeding concept is 192671
gibleed <- cs(descendants(192671))
gibleed <- getConceptSetDetails(gibleed, connection, "main")
gibleed── <Capr Concept Set> ─────────────────────────────────────────────────────────
# A tibble: 1 × 9
conceptId conceptCode conceptName domainId vocabularyId standardConcept
<int> <chr> <chr> <chr> <chr> <chr>
1 192671 74474003 Gastrointestinal … Conditi… SNOMED S
# ℹ 3 more variables: includeDescendants <lgl>, isExcluded <lgl>,
# includeMapped <lgl>
as.json(gibleed){
"items": [
{
"concept": {
"CONCEPT_ID": 192671,
"CONCEPT_NAME": "Gastrointestinal hemorrhage",
"STANDARD_CONCEPT": "S",
"STANDARD_CONCEPT_CAPTION": "Standard",
"INVALID_REASON": "V",
"INVALID_REASON_CAPTION": "Valid",
"CONCEPT_CODE": "74474003",
"DOMAIN_ID": "Condition",
"VOCABULARY_ID": "SNOMED",
"CONCEPT_CLASS_ID": "Clinical Finding"
},
"isExcluded": false,
"includeDescendants": true,
"includeMapped": false
}
]
}
Let’s create four cohorts. Three drug cohorts and one condition cohort.
gibleed <- cs(descendants(192671))
celecoxib <- cs(descendants(1118084))
diclofenac <- cs(descendants(1124300))
nsaids <- cs(descendants(c(1118084, 1124300)))# create concept set
gibleed <- cs(descendants(192671))
celecoxib <- cs(descendants(1118084))
diclofenac <- cs(descendants(1124300))
nsaids <- cs(descendants(c(1118084, 1124300)))
gibleedCohort <- cohort(entry = entry(condition(gibleed)))
celecoxibCohort <- cohort(entry = entry(drug(celecoxib)))
diclofenacCohort <- cohort(entry = entry(drug(diclofenac)))
nsaidsCohort <- cohort(entry = entry(drug(nsaids)))First we need to create a “Cohort Set” object
library(dplyr, warn.conflicts = FALSE)
cohortsToCreate <- tibble(
cohortId = 1L:4L,
cohortName = c("gibleed", "celecoxib", "diclofenac", "nsaids"),
caprCohort = list(gibleedCohort, celecoxibCohort, diclofenacCohort, nsaidsCohort)) %>%
mutate(json = purrr::map_chr(caprCohort, as.json)) %>%
mutate(expression = purrr::map(json, CirceR::cohortExpressionFromJson)) %>%
mutate(sql = purrr::map_chr(expression,
CirceR::buildCohortQuery,
options = CirceR::createGenerateOptions(generateStats = FALSE))) %>%
select("cohortId", "cohortName", "sql", "json") %>%
mutate(cohortId = bit64::as.integer64(cohortId))
CohortGenerator::isCohortDefinitionSet(cohortsToCreate)[1] TRUE
cohortsToCreate# A tibble: 4 × 4
cohortId cohortName sql json
<int64> <chr> <chr> <chr>
1 1 gibleed "CREATE TABLE #Codesets (\r\n codeset_id int NOT N… "{\n…
2 2 celecoxib "CREATE TABLE #Codesets (\r\n codeset_id int NOT N… "{\n…
3 3 diclofenac "CREATE TABLE #Codesets (\r\n codeset_id int NOT N… "{\n…
4 4 nsaids "CREATE TABLE #Codesets (\r\n codeset_id int NOT N… "{\n…
# Create the cohort tables to hold the cohort generation results
cohortTableNames <- CohortGenerator::getCohortTableNames(cohortTable = "my_cohort_table")
CohortGenerator::createCohortTables(connectionDetails = connectionDetails,
cohortDatabaseSchema = "main",
cohortTableNames = cohortTableNames)Creating cohort tables
- Created table main.my_cohort_table
- Created table main.my_cohort_table_inclusion
- Created table main.my_cohort_table_inclusion_result
- Created table main.my_cohort_table_inclusion_stats
- Created table main.my_cohort_table_summary_stats
- Created table main.my_cohort_table_censor_stats
Creating cohort tables took 0.08secs
# Generate the cohorts
cohortsGenerated <- CohortGenerator::generateCohortSet(connectionDetails = connectionDetails,
cdmDatabaseSchema = "main",
cohortDatabaseSchema = "main",
cohortTableNames = cohortTableNames,
cohortDefinitionSet = cohortsToCreate)/4- Generating cohort:
Error in .main(): 1 assertions failed:
* Variable 'sql': Must have length 1, but has length 0.
# find parameters in ohdsi-sql
unique(stringr::str_extract_all(cohortsToCreate$sql[1], "@\\w+")[[1]])[1] "@vocabulary_database_schema" "@cdm_database_schema"
[3] "@target_database_schema" "@target_cohort_table"
[5] "@target_cohort_id" "@results_database_schema"
for (i in seq_len(nrow(cohortsToCreate))) {
sql <- cohortsToCreate$sql[i]
id <- cohortsToCreate$cohortId[i]
sql <- SqlRender::render(sql,
vocabulary_database_schema = "main",
cdm_database_schema = "main",
target_database_schema = "main",
target_cohort_table = cohortTableNames$cohortTable,
target_cohort_id = id,
results_database_schema = "main") %>%
SqlRender::translate("sqlite")
executeSql(connection, sql)
}
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======== | 12%
|
|=========== | 16%
|
|============== | 20%
|
|================= | 24%
|
|==================== | 28%
|
|====================== | 32%
|
|========================= | 36%
|
|============================ | 40%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 68%
|
|================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======== | 12%
|
|=========== | 16%
|
|============== | 20%
|
|================= | 24%
|
|==================== | 28%
|
|====================== | 32%
|
|========================= | 36%
|
|============================ | 40%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 68%
|
|================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======== | 12%
|
|=========== | 16%
|
|============== | 20%
|
|================= | 24%
|
|==================== | 28%
|
|====================== | 32%
|
|========================= | 36%
|
|============================ | 40%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 68%
|
|================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======== | 12%
|
|=========== | 16%
|
|============== | 20%
|
|================= | 24%
|
|==================== | 28%
|
|====================== | 32%
|
|========================= | 36%
|
|============================ | 40%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 68%
|
|================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
# Get the cohort counts
cohortCounts <- CohortGenerator::getCohortCounts(connectionDetails = connectionDetails,
cohortDatabaseSchema = "main",
cohortTable = cohortTableNames$cohortTable)Counting cohorts took 0.0457 secs
print(cohortCounts) cohortId cohortEntries cohortSubjects
1 1 479 479
2 2 1800 1800
3 3 830 830
4 4 2630 2630
The default exit is “end of observation period”.
Let’s define the cohort exit as one day the cohort entry and capture all events including the first one.
nsaidsCohort <- cohort(entry = entry(drug(nsaids), primaryCriteriaLimit = "All"),
exit = exit(fixedExit("startDate", 1L)))
nsaidsCohortFormal class 'Cohort' [package "Capr"] with 4 slots
..@ entry :Formal class 'CohortEntry' [package "Capr"] with 5 slots
..@ attrition:Formal class 'CohortAttrition' [package "Capr"] with 2 slots
..@ exit :Formal class 'CohortExit' [package "Capr"] with 2 slots
..@ era :Formal class 'CohortEra' [package "Capr"] with 3 slots
Now we will add two inclusion criteria
diclofenac <- cs(descendants(1124300), name = "diclofenac")
nsaid <- cs(descendants(21603933), name = "nsaid")
cancer <- cs(descendants(443392), name = "cancer")
ch <- cohort(
entry = entry(drugEra(diclofenac, age(gte(16))),
observationWindow = continuousObservation(-365L, 0L)),
attrition = attrition(
"no prior nsaid or cancer" = withAll(
exactly(0, drug(nsaid), eventStarts(-Inf, 0, index = "startDate")),
exactly(0, condition(cancer), eventStarts(-Inf, 0, index = "startDate"))
)
)
)
cohortSet <- list("diclofenac_without_prior_nsaid_or_cancer" = ch)Capr really shines when you need to create a large number of cohorts programatically
Create one cohort for every ATC drug class
Capr has lots of options that exactly match the options provided by the Atlas web application. Atlas is a great tool for creating a few cohorts in a graphical user interface. Capr complements Atlas providing tools to create a large number of cohorts easily in R.
Create your own cohort with Capr and generate it on an example CDM. The example Eunomia dataset does not have very much data in it so you can try one of the CDM datasets available here.
Download one of the duckdb files and connect to it like so. Note you will need the latest version of duckdb (0.8)
connectionDetails <- createConnectionDetails("duckdb",
server = "~/Desktop/synthea_cdms/allergies_cdm.duckdb")
connection <- connect(connectionDetails)
getTableNames(connection)
disconnect(connection)