library(omopcept)
library(CodelistGenerator)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
DRAFT
TLDR
omopcept has recently gained a function
omop_drug_lookup_create()
to create a lookup table from
drug concept IDs to drug classes in the ATC classification. To create a
lookup table for all RxNorm Extension Ingredients :
drug_lookup = omop_drug_lookup_create()
Use case
Researchers want to be able to identify drug classes from an omop extract that has more specific drug names.
Maybe 1. antibiotics 1. broad spectrum antibiotics 1. specific antibiotic classes
ATC a WHO drug classification
ATC the Anatomical Therapeutic Chemical Classification System maintained by WHO and the Defined Daily Dose (DDD) as a measuring unit “have become the gold standard for international drug utilization monitoring and research”.
In ATC, active substances are classified in a hierarchy with five levels. The system has fourteen main anatomical/pharmacological groups or 1st levels. Each ATC main group is divided into 2nd levels which could be either pharmacological or therapeutic groups. The 3rd and 4th levels are chemical, pharmacological or therapeutic subgroups and the 5th level is the chemical substance.
ATC in OMOP via RxNorm Extension & RxNorm
In OMOP each drug concept ID in the vocabularies
RxNorm Extension
& RxNorm
has ancestor
concepts in the different levels of the ATC hierarchy. The new omopcept
function omop_drug_lookup_create()
creates a lookup table
from drug concept IDs to drug classes in the ATC classification.
Does anything else already do this ?
I couldn’t quite get anything to provide a comprehensive list of drug classes.
Seems that the CodelistGenerator package does things that could solve this.
CodelistGenerator::getATCCodes returns descendants of ATC classes.
@param level ATC level. Can be one or more of “ATC 1st”, “ATC 2nd”, “ATC 3rd”, “ATC 4th”, and “ATC 5th” @param name ATC name of interest. For example, c(“Dermatologicals”, “Nervous System”), would result in a list of length two with the descendant concepts for these two particular ATC groups.
cdm <- mockVocabRef()
#> Warning in validateCdmReference(cdm, soft = .softValidation): There are observation period end dates after the current date: 2025-06-30
#> ℹ The latest max observation period end date found is 2025-12-31
atc1 <- getATCCodes(cdm = cdm, level = "ATC 1st")
# atc2 <- getATCCodes(cdm = cdm, level = "ATC 2nd")
# Error in getATCCodes(cdm = cdm, level = "ATC 2nd") : 2 assertions failed:
# * - No matching ATC codes found
# * Variable 'atcCheck': Must be TRUE.
But these only show results that are in a CDM. I’d like comprehensive results to help me make sure its doing what I want before I apply it to any data.
# OLD exploration
# eval=FALSE because much of this replaced by omop_drug_lookup_create()
#300k rows
rxn <- omopcept::omop_names("", v="RxNorm")
omopfreqconceptclass(rxn)
# doesn't seem to have ATC classes I want ??
# concept_class_id n
# <chr> <int>
# 1 Clinical Drug 53083
# 2 Branded Drug 39083
# 3 Clinical Drug Comp 38474
# 4 Branded Drug Comp 36748
# 5 Branded Drug Form 26595
# 6 Branded Dose Group 23388
# 7 Clinical Drug Form 20773
# 8 Brand Name 19469
# 9 Clinical Dose Group 17741
# 10 Ingredient 15604
# 11 Multiple Ingredients 4469
# 12 Quant Clinical Drug 3860
# 13 Precise Ingredient 3476
# 14 Quant Branded Drug 3215
# 15 Branded Pack 1308
# 16 Clinical Pack 1175
# 17 Dose Form 201
# 18 Dose Form Group 47
# Aha, by looking at code for CodeListGenerator I see that the ATC classes are not in RxNorm but in a vocab called ATC
#6740 rows
atc <- omopcept::omop_names("", v="ATC")
omopfreqconceptclass(atc)
# 1 ATC 5th 5452
# 2 ATC 4th 911
# 3 ATC 3rd 269
# 4 ATC 2nd 94
# 5 ATC 1st 14
# TO be able to get ATC classes for any drug concept
# take the ancestor table
# filter all ancestor_concept_id that are in vocab ATC
# can then use filter any descendant_concept_id that appears in user data
# and should end up with vector of ATC classes at the different levels
#9 million ATC descendants !!
atc_descendants <- omop_concept_ancestor() |>
#ideally would do before collect
#but get Error in df[[id_col_name]] <- as.integer(df[[id_col_name]]) : cannot add bindings to a locked environment
#that I can probably fix in omopcept
#collect() |>
omop_join_name(namestart = "ancestor", columns = c("concept_name","vocabulary_id","concept_class_id")) |>
#renaming of joined columns to differentiate ancestor & descendant
rename(ancestor_vocabulary_id = vocabulary_id) |>
rename(ancestor_concept_class_id = concept_class_id) |>
omop_join_name(namestart = "descendant", columns = c("concept_name","vocabulary_id","concept_class_id")) |>
rename(descendant_vocabulary_id = vocabulary_id) |>
rename(descendant_concept_class_id = concept_class_id) |>
collect() |>
#head(100) |>
filter(ancestor_vocabulary_id=="ATC")
atc_descendants |> count(descendant_vocabulary_id, sort=TRUE)
# 1 RxNorm Extension 8732808
# 2 RxNorm 819887
# 3 ATC 30849
# 4 HCPCS 140
# 5 HemOnc 66
atc_descendants |> count(ancestor_concept_class_id, sort=TRUE)
#8.7m rows
atc_rxnormext_descendants <- atc_descendants |>
filter(descendant_vocabulary_id == "RxNorm Extension") |>
select(ATC_concept_id = ancestor_concept_id,
ATC_concept_name = ancestor_concept_name,
drug_concept_id = descendant_concept_id,
drug_concept_name = descendant_concept_name,
ATC_level = ancestor_concept_class_id,
#this is RxNormExtension concept class e.g. Branded Drug etc.
#probably don't need because it will be in drug_exposure table
#but may be useful here during development
concept_class_id = descendant_concept_class_id
) |>
#extract numeric part of the ATC level
mutate(ATC_level = str_sub(ATC_level,5,5))
atc_rxnormext_descendants |> count(concept_class_id, sort=TRUE)
# 1 Marketed Product 2692126
# 2 Branded Drug Box 1084960
# 3 Branded Drug 856819
# 4 Branded Drug Comp 710365
# 5 Branded Drug Form 569727
# 6 Clinical Drug Box 532821
# 7 Quant Branded Drug 467679
# 8 Quant Branded Box 411968
# 9 Quant Clinical Drug 346667
# 10 Clinical Drug 333783
# 11 Quant Clinical Box 324265
# 12 Clinical Drug Comp 200522
# 13 Clinical Drug Form 151217
# 14 Branded Pack 16291
# 15 Clinical Pack 12935
# 16 Ingredient 8911
# 17 Branded Pack Box 6604
# 18 Clinical Pack Box 5148
#SOMETHING NOT RIGHT ? SEEMS CAN'T FIND MANY DRUGS IN HERE
#TODO check in one of UCLH extracts what the values of RxNormExt concept_class_id are
#e.g. is it just Ingredient ?
atc_rxnormext_ingredient <- atc_rxnormext_descendants |>
filter(concept_class_id %in% c("Ingredient"))
atc2ingredients <- atc_rxnormext_ingredient |>
filter(ATC_level %in% c("ATC 1st"))
atc_rxnormext_clinicaldrug <- atc_rxnormext_descendants |>
filter(concept_class_id %in% c("Clinical Drug"))
atc_tmp <- atc_rxnormext_descendants |> head(1000)
atc_from_rxnormext_ingredient <- atc_rxnormext_ingredient |>
#atc_from_rxnormext_wide <- atc_rxnormext_descendants |>
#atc_from_rxnormext_wide <- atc_tmp |>
pivot_wider(names_from = ATC_level,
values_from = c(ATC_concept_name, ATC_concept_id),
id_cols=c(drug_concept_name, drug_concept_id))
# Warning message:
# Values from `ATC_concept_id` and `ATC_concept_name` are not uniquely identified; output will contain list-cols.
# Use the following dplyr code to identify duplicates.
# 800k rows !
#dup <- atc_rxnormext_descendants |>
dup <- atc_rxnormext_ingredient |>
dplyr::summarise(n = dplyr::n(), .by = c(drug_concept_name, drug_concept_id, ATC_level)) |>
dplyr::filter(n > 1L)
# SO a drug can be in more than one ATC class
# lets have a look at a single drug example
drugname <- "biapenem"
drugname <- "Bromisoval" #apparently has 37 duplicates !!
drugname <- "Azidamfenicol" #10 duplicates
oneingredient <- atc_rxnormext_descendants |>
filter(drug_concept_name == drugname) |>
arrange(ATC_level)
#biapenem has single entry per ATC level
# ATC_concept_id ATC_concept_name drug_concept_id drug_concept_name ATC_level concept_class_id
# 1 21602795 ANTIINFECTIVES FOR SYSTEMIC USE 35198093 biapenem ATC 1st Ingredient
# 2 21602796 ANTIBACTERIALS FOR SYSTEMIC USE 35198093 biapenem ATC 2nd Ingredient
# 3 21602868 OTHER BETA-LACTAM ANTIBACTERIALS 35198093 biapenem ATC 3rd Ingredient
# 4 21602920 Carbapenems 35198093 biapenem ATC 4th Ingredient
# 5 21602924 biapenem; parenteral 35198093 biapenem ATC 5th Ingredient
#but e.g. Azidamfenicol has multiple entries in all ATC levels
#suggests that maybe we need multiple rows per drug (i.e. keep data long)
#perhaps function can offer option for short & long ?
#OR maybe start by producing a long lookup table because we are close to that already
#
#even Bromisoval, most duplicates are in level 5
#oneingredient |> count(ATC_level, sort=TRUE)
# 1 ATC 5th 37
# 2 ATC 4th 6
# 3 ATC 3rd 4
# 4 ATC 1st 1
# 5 ATC 2nd 1
#TEMPTED to try to plot relationships of ATC levels
#probably 1-3
#I think we may want ATC (3rd level, pharmacological subgroup)
atc3_rxnormext_descendants <- atc_descendants |>
filter(descendant_vocabulary_id == "RxNorm Extension") |>
filter(ancestor_concept_class_id == "ATC 3rd")
atc2_rxnormext_descendants <- atc_descendants |>
filter(descendant_vocabulary_id == "RxNorm Extension") |>
filter(ancestor_concept_class_id == "ATC 2nd")
atc1_rxnormext_descendants <- atc_descendants |>
filter(descendant_vocabulary_id == "RxNorm Extension") |>
filter(ancestor_concept_class_id == "ATC 1st")
#234 rows at ATC3
freq_atc3 <- atc3_rxnormext_descendants |> count(ancestor_concept_name, sort=TRUE)
#90 rows at ATC2
freq_atc2 <- atc2_rxnormext_descendants |> count(ancestor_concept_name, sort=TRUE)
#14 rows at ATC1
freq_atc1 <- atc1_rxnormext_descendants |> count(ancestor_concept_name, sort=TRUE)
cyto <- atc3_rxnormext_descendants |>
filter(ancestor_concept_name == "CYTOTOXIC ANTIBIOTICS AND RELATED SUBSTANCES")
#amoxicillin etc.
beta <- atc3_rxnormext_descendants |>
filter(ancestor_concept_name == "BETA-LACTAM ANTIBACTERIALS, PENICILLINS")
#OTHER BETA-LACTAM ANTIBACTERIALS
#find what level2 is for level3= "BETA-LACTAM ANTIBACTERIALS, PENICILLINS"
beta_atc2 <- atc_descendants |>
filter(ancestor_concept_class_id == "ATC 2nd") |>
filter(descendant_concept_name == "BETA-LACTAM ANTIBACTERIALS, PENICILLINS")
anti_atc3 <- atc_descendants |>
filter(ancestor_concept_name == "ANTIBACTERIALS FOR SYSTEMIC USE" ) |>
filter(descendant_concept_class_id == "ATC 3rd")
anti_atc4 <- atc_descendants |>
filter(ancestor_concept_name == "ANTIBACTERIALS FOR SYSTEMIC USE" ) |>
filter(descendant_concept_class_id == "ATC 4th")
beta_atc1 <- atc_descendants |>
filter(ancestor_concept_class_id == "ATC 1st") |>
filter(descendant_concept_name == "BETA-LACTAM ANTIBACTERIALS, PENICILLINS")
a3_and_4 <- bind_rows(anti_atc3, anti_atc4)
#SO WHAT WE COULD DO is create a post-processing script that will add columns to the omop extracts for ATC2,3,& 4 likely to be most useful
#TODO
#write a function addATC_to_drug_concepts()
#want to add 1 column for each ATC level
#probably by filtering all ATC levels & then doing pivot_wider()