# Supplementary information
# R code for all figures

# Establishing Digital Reference Material Documentation Infrastructure for Chemical Metrology
# Bruno Garrido, Tanishka Ghosh, Daniel Yang, Patricia LeBlanc, Marcin Paluch, Sophie Roy, Pearse McCarron, Zoltán Mester, and Juris Meija
# email: juris.meija@nrc-cnrc.gc.ca  

  
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


# FIGURE 2 ####
# Measurement results across the entire NRC CRM catalogue. 
# Each dot corresponds to a certified value and the results are compared 
# with the classical benchmark known as the Horwitz curve (red line) 
# that relates the relative reproducibility uncertainty (s_R/%) 
# with the mass fraction of the analytes (w) as log2[sR/%] = 1 - 0.5*log10[w/(g/g)]

library(xml2)

# step 1 # Retrieve all records from the CRM catalogue

xml.url = 'https://nrc-digital-repository.canada.ca/eng/search/atom/?q=*&cn=crm&av=1'
xml.doc = read_html(xml.url)

xml.ids = xml_find_all(xml.doc, ".//id")[-1]
xml.ids = gsub('urn:uuid:', '', xml_text(xml.ids))

xml.name = sapply(1:length(xml.ids), function(nr){
  xml.url  = paste0('https://nrc-digital-repository.canada.ca/eng/view/object/?id=', xml.ids[nr])
  str_split(xml_text(xml_find_first(read_html(xml.url), './/title')),"[:]")[[1]][1]
})

xml.ax = lapply(1:length(xml.ids), function(nr){
  xml.url  = paste0('https://nrc-digital-repository.canada.ca/eng/view/ax/?id=', xml.ids[nr])
  d = tryCatch(read_xml(xml.url), error = function(e) NA)
  d
})

# step 3 # Parse all XML files

# Location of the analytexml schema namespace
ns <- c(d1 = "http://dr-dn.nrc-cnrc.gc.ca/analytexml")

parse_analyte_xml <- function(nr) {
  doc = xml.ax[[nr]]
  
  if(is.na(doc)) return(NULL)
  
  # Extract material-level info
  material_name <- xml_text(xml_find_first(doc, ".//d1:material/d1:name/d1:term[@xml:lang='en']", ns))
  if(is.na(material_name)) material_name = xml.names[nr]
  material_doi  <- xml_text(xml_find_first(doc, ".//d1:material/d1:identifier[@type='doi']", ns))
  validity_date <- xml_text(xml_find_first(doc, ".//d1:validity/d1:date[@point='start']", ns))
  storage_temp  <- xml_text(xml_find_first(doc, ".//d1:validity/d1:storage/d1:temperature/d1:value", ns))
  
  # Extract analyte nodes
  analytes <- xml_find_all(doc, ".//d1:analyte", ns)
  
  # Parse analyte information
  parsed <- map_dfr(analytes, function(a) {
    tibble(
      material_name = material_name,
      material_doi = material_doi,
      validity_start = validity_date,
      storage_temp = storage_temp,
      analyte_name_en = xml_text(xml_find_first(a, ".//d1:name/d1:term[@xml:lang='en']", ns)),
      analyte_inchikey = xml_text(xml_find_first(a, ".//d1:identifier[@type='InChIKey']", ns)),
      quantity_en = xml_text(xml_find_first(a, ".//d1:amount/d1:quantity/d1:term[@xml:lang='en']", ns)),
      value = xml_text(xml_find_first(a, ".//d1:amount/d1:value", ns)),
      unit = xml_text(xml_find_first(a, ".//d1:amount/d1:unit", ns)),
      uncertainty = xml_text(xml_find_first(a, ".//d1:amount/d1:uncertainty[@type='standard']", ns)),
      type_en = xml_text(xml_find_first(a, ".//d1:amount/d1:type/d1:term[@xml:lang='en']", ns))
    )
  })
  
  return(parsed)
}

# Apply the parse_analyte_xml( function to all XML files
df_all <- map_dfr(1:length(xml.ax), parse_analyte_xml)

# Select mass fraction of the analytes and retain only certified values
df_all = subset(df_all, quantity_en == 'mass fraction' & type_en == 'certified' & !is.na(as.double(value)))

# step 4 # Convert all measurement units into base unit kg/kg

require(units)

df_all$factor = sapply(df_all$unit, function(x) as.double(set_units(as_units(x))))
df_all$value_base = as.double(df_all$value) * df_all$factor

# step 5 # Plot all certified values

plot(x=df_all$value_base, y=100*as.double(df_all$uncertainty)/as.double(df_all$value), 
     log='x', pch=21, col='steelblue4', cex=1.3, ylim=c(1,25),
     xaxt='n', yaxt='n', 
     xlab = expression('Mass fraction, '*italic(w)),
     ylab = expression('Relative uncertainty,  '*italic(u)*'('*italic(w)*')/'*italic(w))     )
axis(side=1, at = c(1e-9,1e-6,1e-3), labels = c('1 ng/g', '1 \u00b5g/g', '1 mg/g'))
axis(side=2, at = c(0,5,10,15,20,25), labels = c('0%', '5%', '10%', '15%', '20%', '25%'))

x_horwitz = 10^seq(from=-11, to=0, length.out = 100)
s_r_horwitz = 2^(1 - 0.5 * log10(x_horwitz))
lines(x=x_horwitz, y=s_r_horwitz, col=2, lwd=2)

box()


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


# FIGURE 3 ####
# Set of reference materials containing arsenic, cadmium, lead, and mercury, 
# all below a specified level

# step 1 # Filter reference materials that contain certified values for arsenic, cadmium, lead and mercury

require(dplyr)
df_filter = subset(df_all, df_all$analyte_name_en %in% c('arsenic','cadmium','lead','mercury'), select = c('material_name','analyte_name_en','value_base'))
df_filter_wide = df_filter %>% pivot_wider(names_from = analyte_name_en, values_from = value_base)

# function to filter by the analyte levels

filter_4_elements = function(x){
  df_filter_wide$arsenic < x & df_filter_wide$cadmium < x
  unique(subset(df_filter_wide$material_name, df_filter_wide$arsenic < x & df_filter_wide$cadmium < x & df_filter_wide$lead < x & df_filter_wide$mercury < x))
}

# plot the results with varying threshold for the four analytes (values)

values = c(500,10,0.1,0.01)
plot(x=0.01, y=1, pch='',xlim=c(0.01,500), log='x', ylim=c(0,13), bty='n', xaxt='n', yaxt='n', xpd=T,
     xlab=expression('Mass fraction threshold, '*italic(X)), ylab='')
text(x=0.5,y=9, expression(italic(w)*'(As,Cd,Pb,Hg)/(mg/kg) < '*italic(X)))
for(i in 1:length(values)){
  materials = filter_4_elements(values[i]*1e-6)  
  for(j in 1:length(materials)) text(x=values[i], y=j, materials[j], adj=0.5, xpd=T, font=2, col='steelblue4')  
}
axis(side=1, at=c(0.01,0.1,10,500), labels=c(0.01,0.1,10,500))


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


# FIGURE 4 ####
# Polarity-size plot of substances from the NRC CRM catalogue whose quadrants 
# mark organic substances of low and high polarity and molar mass 

# namespace for the PubChem pug_rest schema
ns2 <- c(d2 = "http://pubchem.ncbi.nlm.nih.gov/pug_rest")

# function to fetch molar mass and polarity from the inchikey
get_mw_xlogp = function(inchikey){
  res = tryCatch(read_xml(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/", inchikey, "/property/XLogP,MolecularWeight/XML")), error = function(e) NULL)
  if(is.null(res)) return(c(NULL,NULL))
  mw = xml_text(xml_find_first(res, ".//d2:MolecularWeight", ns2))
  xlogp = xml_text(xml_find_first(res, ".//d2:XLogP", ns2))
  return(c(MW = as.double(mw), XLOGP = as.double(xlogp)))
}

# obtain molecular weight and polarity data from all unique inchikeys in the NRC Digital Repository records
res = sapply(unique(df_all$analyte_inchikey), get_mw_xlogp)

# plot the results
plot(x = res[,'XLOGP'], y=res[,'MW'], pty='s', pch = 19, col = 'steelblue4',
     xlab=expression('log'*italic(P)['OW']),
     ylab=expression(italic(M)*'/(g/mol)')
     )
abline(h=300, v = -2, col=2)
box()

# end