# Loading packages and reading data ---------------------------------------library(tidyverse)library(cowplot)library(rgbif)library(geodata)library(sf)library(units)library(lubridate)theme_set(theme_cowplot()) # ggplot themeoptions(scipen=999) # Avoiding scientific notation of numbers# Retrieve data gbif dat<-occ_download_get("0035794-250920141307145",overwrite = F) |>occ_download_import() # https://doi.org/10.15468/dl.wuurkx# remove columns with only NAs. They were added by GBIF.dat <- dat |>select(-where(function(x) all(is.na(x))))# Check taxonomic match by gbifunmatched_taxa <- dat |>filter(scientificName =="incertae sedis") |>summarise(n=n(), .by= verbatimScientificName) |>arrange(desc(n))unmatched_taxa
# all matched
All species matched against GBIF Backbone
Spatial distribution
How many records per country?
records_per_country<- dat |>group_by(countryCode) |>summarise(number=n()) |>arrange(desc(number))records_per_country
Very few data points are outside of Germany.
Plot simple world map with data:
included_countries <- records_per_country$countryCode# download shapecountries <-world(resolution=4, path=".", level=0, version="latest")# convert to sf objectcountries<- sf::st_as_sf(countries)# make data records spatialprojcrs <-"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"dat <-dat |>filter(!is.na(decimalLatitude) &!is.na(decimalLongitude)) #remove NAsdat <- dat |>st_as_sf(coords =c("decimalLongitude", "decimalLatitude"),crs = projcrs)# inspectggplot()+geom_sf(data= countries)+geom_sf(data = dat, col="red")
Crop to extent of data:
# Use European projectioncountries<- countries |>st_transform(crs =3035)dat<- dat |>st_transform(crs =3035)# bounding box around data bb <-st_bbox(dat) # zoom to area with databuffer <-50000# add 50 km buffer to bounding boxbb <- bb +c(-buffer, -buffer, buffer, buffer)#plotplot1<-ggplot()+geom_sf(data= countries, fill="#e0ecdf")+geom_sf(data = dat, col="#b2182b")+coord_sf(xlim =c(bb["xmin"], bb["xmax"]),ylim =c(bb["ymin"], bb["ymax"]),expand =0)plot1
Lets use a hexagonal grid instead, so it’s easier to see the point density:
# subset base map to bounding boxcountries <-st_intersection(countries, st_as_sfc(bb))
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
# make hexagonal gridgrid <-st_make_grid(countries, square =FALSE,cellsize =set_units(40000, "m"))# convert grid to sfgrid <-st_as_sf(grid)# check which cells have data points in them and count theminter<-st_intersects(grid, dat)grid$records=lengths(inter) plot2<-ggplot()+geom_sf(data= countries, fill="#e0ecdf")+geom_sf(data =filter(grid, records>0), # only cells with dataaes(fill= records), alpha=1)+# geom_sf(data= filter(countries, COUNTRY=="Germany"), fill=NA, color="black")+scale_fill_viridis_c(option="magma",trans ="log", direction=-1,breaks =c(1,10,100,3000),name="# Records")+theme(legend.position ="right")plot2
Temporal distribution
Count records per year and plot barplot
data_per_year<- dat |>group_by(year) |>summarise(number=n())# inspect earliest yearshead(data_per_year)
# inspect latest yearstail(data_per_year)
The earliest data are from 1573. Not sure if that is correct. The latest are from 2025. There are some records without year information
# plot data plot3<- data_per_year |>ggplot()+geom_bar(aes(x=year, y=number), stat="identity", fill ="#4daf4a")+labs(y="# Records", x="Year")plot3
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_bar()`).
It’s hard to see the few historical data. I’ll facet the data along the Year axis
# Splitting the data data_per_year <- data_per_year %>%mutate(period =case_when( year <1880~"Before 1880", year <1980~"1880 - 1980",TRUE~"1980 – Present" ),period =factor(period, levels =c("Before 1880","1880 - 1980", "1980 – Present")) )plot3<- data_per_year |>ggplot()+geom_bar(aes(x=year, y=number), stat="identity", fill ="#4daf4a")+labs(y="# Records", x="Year")+facet_wrap(~ period, nrow =1, scales ="free")plot3
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_bar()`).
# count records per period for captiondata_per_period <-data_per_year |>group_by(period) |>summarise(total=sum(number))data_per_period
Figure 1: Geographic and temporal scope of the data set. A Spatial distribution and number of records based on a 40 km hexagonal grid. B Temporal distribution as number of records per year. For better readability the data are facetted into three time periods: Before 1880 (63 records), 1880–1980 (464 records), 1980 – Present (53123 records).