# Supplementary information # R code for all figures # Establishing Digital Reference Material Documentation Infrastructure for Chemical Metrology # Bruno Garrido, Tanishka Ghosh, Daniel Yang, Patricia LeBlanc, Marcin Paluch, Sophie Roy, Pearse McCarron, Zoltán Mester, and Juris Meija # email: juris.meija@nrc-cnrc.gc.ca # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # FIGURE 2 #### # Measurement results across the entire NRC CRM catalogue. # Each dot corresponds to a certified value and the results are compared # with the classical benchmark known as the Horwitz curve (red line) # that relates the relative reproducibility uncertainty (s_R/%) # with the mass fraction of the analytes (w) as log2[sR/%] = 1 - 0.5*log10[w/(g/g)] library(xml2) # step 1 # Retrieve all records from the CRM catalogue xml.url = 'https://nrc-digital-repository.canada.ca/eng/search/atom/?q=*&cn=crm&av=1' xml.doc = read_html(xml.url) xml.ids = xml_find_all(xml.doc, ".//id")[-1] xml.ids = gsub('urn:uuid:', '', xml_text(xml.ids)) xml.name = sapply(1:length(xml.ids), function(nr){ xml.url = paste0('https://nrc-digital-repository.canada.ca/eng/view/object/?id=', xml.ids[nr]) str_split(xml_text(xml_find_first(read_html(xml.url), './/title')),"[:]")[[1]][1] }) xml.ax = lapply(1:length(xml.ids), function(nr){ xml.url = paste0('https://nrc-digital-repository.canada.ca/eng/view/ax/?id=', xml.ids[nr]) d = tryCatch(read_xml(xml.url), error = function(e) NA) d }) # step 3 # Parse all XML files # Location of the analytexml schema namespace ns <- c(d1 = "http://dr-dn.nrc-cnrc.gc.ca/analytexml") parse_analyte_xml <- function(nr) { doc = xml.ax[[nr]] if(is.na(doc)) return(NULL) # Extract material-level info material_name <- xml_text(xml_find_first(doc, ".//d1:material/d1:name/d1:term[@xml:lang='en']", ns)) if(is.na(material_name)) material_name = xml.names[nr] material_doi <- xml_text(xml_find_first(doc, ".//d1:material/d1:identifier[@type='doi']", ns)) validity_date <- xml_text(xml_find_first(doc, ".//d1:validity/d1:date[@point='start']", ns)) storage_temp <- xml_text(xml_find_first(doc, ".//d1:validity/d1:storage/d1:temperature/d1:value", ns)) # Extract analyte nodes analytes <- xml_find_all(doc, ".//d1:analyte", ns) # Parse analyte information parsed <- map_dfr(analytes, function(a) { tibble( material_name = material_name, material_doi = material_doi, validity_start = validity_date, storage_temp = storage_temp, analyte_name_en = xml_text(xml_find_first(a, ".//d1:name/d1:term[@xml:lang='en']", ns)), analyte_inchikey = xml_text(xml_find_first(a, ".//d1:identifier[@type='InChIKey']", ns)), quantity_en = xml_text(xml_find_first(a, ".//d1:amount/d1:quantity/d1:term[@xml:lang='en']", ns)), value = xml_text(xml_find_first(a, ".//d1:amount/d1:value", ns)), unit = xml_text(xml_find_first(a, ".//d1:amount/d1:unit", ns)), uncertainty = xml_text(xml_find_first(a, ".//d1:amount/d1:uncertainty[@type='standard']", ns)), type_en = xml_text(xml_find_first(a, ".//d1:amount/d1:type/d1:term[@xml:lang='en']", ns)) ) }) return(parsed) } # Apply the parse_analyte_xml( function to all XML files df_all <- map_dfr(1:length(xml.ax), parse_analyte_xml) # Select mass fraction of the analytes and retain only certified values df_all = subset(df_all, quantity_en == 'mass fraction' & type_en == 'certified' & !is.na(as.double(value))) # step 4 # Convert all measurement units into base unit kg/kg require(units) df_all$factor = sapply(df_all$unit, function(x) as.double(set_units(as_units(x)))) df_all$value_base = as.double(df_all$value) * df_all$factor # step 5 # Plot all certified values plot(x=df_all$value_base, y=100*as.double(df_all$uncertainty)/as.double(df_all$value), log='x', pch=21, col='steelblue4', cex=1.3, ylim=c(1,25), xaxt='n', yaxt='n', xlab = expression('Mass fraction, '*italic(w)), ylab = expression('Relative uncertainty, '*italic(u)*'('*italic(w)*')/'*italic(w)) ) axis(side=1, at = c(1e-9,1e-6,1e-3), labels = c('1 ng/g', '1 \u00b5g/g', '1 mg/g')) axis(side=2, at = c(0,5,10,15,20,25), labels = c('0%', '5%', '10%', '15%', '20%', '25%')) x_horwitz = 10^seq(from=-11, to=0, length.out = 100) s_r_horwitz = 2^(1 - 0.5 * log10(x_horwitz)) lines(x=x_horwitz, y=s_r_horwitz, col=2, lwd=2) box() # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # FIGURE 3 #### # Set of reference materials containing arsenic, cadmium, lead, and mercury, # all below a specified level # step 1 # Filter reference materials that contain certified values for arsenic, cadmium, lead and mercury require(dplyr) df_filter = subset(df_all, df_all$analyte_name_en %in% c('arsenic','cadmium','lead','mercury'), select = c('material_name','analyte_name_en','value_base')) df_filter_wide = df_filter %>% pivot_wider(names_from = analyte_name_en, values_from = value_base) # function to filter by the analyte levels filter_4_elements = function(x){ df_filter_wide$arsenic < x & df_filter_wide$cadmium < x unique(subset(df_filter_wide$material_name, df_filter_wide$arsenic < x & df_filter_wide$cadmium < x & df_filter_wide$lead < x & df_filter_wide$mercury < x)) } # plot the results with varying threshold for the four analytes (values) values = c(500,10,0.1,0.01) plot(x=0.01, y=1, pch='',xlim=c(0.01,500), log='x', ylim=c(0,13), bty='n', xaxt='n', yaxt='n', xpd=T, xlab=expression('Mass fraction threshold, '*italic(X)), ylab='') text(x=0.5,y=9, expression(italic(w)*'(As,Cd,Pb,Hg)/(mg/kg) < '*italic(X))) for(i in 1:length(values)){ materials = filter_4_elements(values[i]*1e-6) for(j in 1:length(materials)) text(x=values[i], y=j, materials[j], adj=0.5, xpd=T, font=2, col='steelblue4') } axis(side=1, at=c(0.01,0.1,10,500), labels=c(0.01,0.1,10,500)) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # FIGURE 4 #### # Polarity-size plot of substances from the NRC CRM catalogue whose quadrants # mark organic substances of low and high polarity and molar mass # namespace for the PubChem pug_rest schema ns2 <- c(d2 = "http://pubchem.ncbi.nlm.nih.gov/pug_rest") # function to fetch molar mass and polarity from the inchikey get_mw_xlogp = function(inchikey){ res = tryCatch(read_xml(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/", inchikey, "/property/XLogP,MolecularWeight/XML")), error = function(e) NULL) if(is.null(res)) return(c(NULL,NULL)) mw = xml_text(xml_find_first(res, ".//d2:MolecularWeight", ns2)) xlogp = xml_text(xml_find_first(res, ".//d2:XLogP", ns2)) return(c(MW = as.double(mw), XLOGP = as.double(xlogp))) } # obtain molecular weight and polarity data from all unique inchikeys in the NRC Digital Repository records res = sapply(unique(df_all$analyte_inchikey), get_mw_xlogp) # plot the results plot(x = res[,'XLOGP'], y=res[,'MW'], pty='s', pch = 19, col = 'steelblue4', xlab=expression('log'*italic(P)['OW']), ylab=expression(italic(M)*'/(g/mol)') ) abline(h=300, v = -2, col=2) box() # end