Bibentry for FAIR datasets

An important aim of the dataset package is to create native R objects where bibliographic metadata cannot be detached, thus ensuring Findability, Accessibility, Interoperability and Reusability in the long run. We provide an interface and methods to add metadata required by open data repositories according to the more general Dublin Core library metadata standard, or the more specific DataCite metadata standard.

Titles

dataset_title(iris_dataset)
#> [1] "Iris Dataset"

dataset_title(iris_dataset_2, overwrite=TRUE) <- "The Famous Iris Dataset"
get_bibentry(iris_dataset_2)
#> Anderson E (1935). "The Famous Iris Dataset."

Creators

The creator corresponds to dct:creator in Dublin Core and Creator in DataCite, the two most important metadata definitions for publishing datasets in repositories. They refer to the name of the entity that holds, archives, publishes, prints, distributes, releases, issues, or produces the dataset. This property will be used to formulate the citation.

creator(iris_dataset)
#> [1] "Edgar Anderson [aut]"

iris_dataset_2 <- iris_dataset
# Add a new creator, with overwriting existing authorship information:
creator(iris_dataset_2, overwrite=TRUE) <- person("Jane", "Doe", role = "aut")

# Add a new creator, without overwriting existing authorship information:
creator(iris_dataset_2, overwrite=FALSE) <- person("John", "Doe", role = "ctb")

# The two new creators:
creator(iris_dataset_2)
#> [1] "Jane Doe [aut]" "John Doe [ctb]"

Further descriptive metadata about the whole dataset

Publication year

The publication year is usually one of the most important pieces of descriptive metadata in repositories and libraries:

publication_year(iris_dataset_2)
#> [1] "1935"

The default value is :unas for unassigned values:

# Revert to default (unassigned):
publication_year(iris_dataset_2) <- NULL

# Get the default value:
publication_year(iris_dataset_2) 
#> [1] ":unas"

Language

# Get the language:
language(iris_dataset)
#> [1] "en"

# Reset the language:
language(iris_dataset_2) <- "French"
language(iris_dataset_2)
#> [1] "fra"

Rights statement

# Add rights statement to the dataset
rights(iris_dataset_2, overwrite = TRUE)  <- "GNU-2"

Some metadata functions prevent accidental overwriting, except for the default :unas unassigned and :tba to-be-announced values.

rights(iris_dataset_2) <- "CC0"
#> The dataset has already a rights field: :tba
rights(iris_dataset_2)
#> [1] ":tba"

Overwriting the rights statement needs an explicit approval:

rights(iris_dataset_2, overwrite = TRUE)  <- "GNU-2"

DataCite currently allows the use of subproperties. For example, the Creative Commons Attribution 4.0 International would be described as:

list ( schemeURI="https://spdx.org/licenses/",
       rightsIdentifierScheme="SPDX",
       rightsIdentifier="CC-BY-4.0",
       rightsURI="https://creativecommons.org/licenses/by/4.0/")
#> $schemeURI
#> [1] "https://spdx.org/licenses/"
#> 
#> $rightsIdentifierScheme
#> [1] "SPDX"
#> 
#> $rightsIdentifier
#> [1] "CC-BY-4.0"
#> 
#> $rightsURI
#> [1] "https://creativecommons.org/licenses/by/4.0/"

The use of subproperties will be implemented later.

Description

The description is currently implemented as a character string. However, DataCite 4.6 states that if Description is used, descriptionType is mandatory. This will be implemented later.

<descriptions>
    <description xml:lang="en" descriptionType="Abstract">Example abstract</description>
</descriptions>

description(iris_dataset)
#> [1] "The famous (Fisher's or Anderson's) iris data set."

Subject

subject(iris_dataset)
#> $term
#> [1] "Irises (plants)"
#> 
#> $subjectScheme
#> [1] "LCCH"
#> 
#> $schemeURI
#> [1] "http://id.loc.gov/authorities/subjects"
#> 
#> $valueURI
#> [1] "https://id.loc.gov/authorities/subjects/sh85068079"
#> 
#> $classificationCode
#> NULL
#> 
#> $prefix
#> [1] "lcch:"
#> 
#> attr(,"class")
#> [1] "subject" "list"

<subjects>
  <subject xml:lang="en" subjectScheme="Library of Congress Subject Headings (LCSH)" schemeURI="https://id.loc.gov/authorities/subjects.html" valueURI="https://id.loc.gov/authorities/subjects/sh2009009655.html">Climate change mitigation</subject>
  <subject xml:lang="en" subjectScheme="ANZSRC Fields of Research" schemeURI="https://www.abs.gov.au/statistics/classifications/australian-and-new-zealand-standard-research-classification-anzsrc" classificationCode="370201">Climate change processes</subject>
</subjects>

subject_create(
  term = "data sets", 
  subjectScheme = "Library of Congress Subject Headings (LCSH)", 
  schemeURI = "https://id.loc.gov/authorities/subjects.html",
  valueURI = "http://id.loc.gov/authorities/subjects/sh2018002256"
)
#> $term
#> [1] "data sets"
#> 
#> $subjectScheme
#> [1] "Library of Congress Subject Headings (LCSH)"
#> 
#> $schemeURI
#> [1] "https://id.loc.gov/authorities/subjects.html"
#> 
#> $valueURI
#> [1] "http://id.loc.gov/authorities/subjects/sh2018002256"
#> 
#> $classificationCode
#> NULL
#> 
#> $prefix
#> [1] ""
#> 
#> attr(,"class")
#> [1] "subject" "list"

Identifiers

# Get the identifier of the dataset
identifier(iris_dataset_2)
#> [1] "https://doi.org/10.5281/zenodo.10396807"

All bibliographic information

Get the metadata according to the DataCite definition:

print(as_datacite(iris_dataset), "Bibtex")
#> @Misc{,
#>   title = {Iris Dataset},
#>   author = {Edgar Anderson},
#>   identifier = {https://doi.org/10.5281/zenodo.10396807},
#>   publisher = {American Iris Society},
#>   year = {1935},
#>   date = {:tba},
#>   language = {en},
#>   alternateidentifier = {:unas},
#>   relatedidentifier = {:unas},
#>   format = {:unas},
#>   version = {:unas},
#>   rights = {:tba},
#>   description = {The famous (Fisher's or Anderson's) iris data set.},
#>   geolocation = {:unas},
#>   fundingreference = {:unas},
#> }

And according to DCTERMS (Dublin Core):

print(as_dublincore(iris_dataset), "Bibtex")
#> @Misc{,
#>   title = {Iris Dataset},
#>   author = {Edgar Anderson},
#>   identifier = {https://doi.org/10.5281/zenodo.10396807},
#>   publisher = {American Iris Society},
#>   year = {1935},
#>   language = {en},
#>   relation = {:unas},
#>   format = {:unas},
#>   rights = {:tba},
#>   description = {The famous (Fisher's or Anderson's) iris data set.},
#>   type = {DCMITYPE:Dataset},
#>   datasource = {https://doi.org/10.1111/j.1469-1809.1936.tb02137.x},
#>   coverage = {:unas},
#> }