Free natural history images and appropriate R tooling!

OCR bird naming workflow, piece by piece

Image preparation

library("magrittr")
filenames <- fs::dir_ls("birds")
magick::image_read(filenames[1])

crop_bird <- function(filename){
  image <- magick::image_read(filename)
  
  height <- magick::image_info(image)$height
  
  # crop the top of the image
  image <- magick::image_crop(image, 
                     paste0("+0+",round(0.75*height))) %>%
    # convert the image to black and white
    magick::image_convert(type = "grayscale") %>%
    # increase brightness
    magick::image_modulate(brightness = 120) %>%
    magick::image_enhance() %>%
    magick::image_median() %>%
    magick::image_contrast() 
  
  # we'll need the filename later
  attr(image, "filename") <- filename
  
  return(image)
}

crop_bird(filenames[1])

Text extraction

get_names <- function(image){
  filename <- attr(image, "filename")
  ocr_options <- list(tessedit_pageseg_mode = 1)
   
  text <- magick::image_ocr(image, options = ocr_options)
  text <- stringr::str_split(text, "\n", simplify = TRUE)
  text <- stringr::str_remove_all(text, "[0-9]")
  text <- stringr::str_remove_all(text, "[:punct:]")
  text <- trimws(text)
  text <- stringr::str_remove_all(text, "~")
  text <- text[text != ""]
  text <- tolower(text)
  
  # remove one letter words
  # https://stackoverflow.com/questions/31203843/r-find-and-remove-all-one-to-two-letter-words
  text <- stringr::str_remove_all(text, " *\\b[[:alpha:]]{1,2}\\b *")
  text <- text[text != ""]
  
  # keep only the words that are recognized as either Latin
  # or English by cld2 or cld3
  if(length(text) > 0){
    results <- tibble::tibble(text = text,
                 cld2 = cld2::detect_language(text),
                 cld3 = cld3::detect_language(text),
                 filename = filename)
  
  results[results $cld2 %in% c("la", "en") |
          results$cld3 %in% c("la", "en"),]
  }else{
    return(NULL)
  }
  
  
}

(results1 <- filenames[1] %>%
  magick::image_read() %>%
  get_names())
## NULL
(results2 <- filenames[1] %>%
  crop_bird() %>%
  get_names())
## # A tibble: 2 x 4
##   text                 cld2  cld3  filename                          
##   <chr>                <chr> <chr> <fs::path>                        
## 1 climacteris picumnus <NA>  la    birds/n115_w1150_42399797481_o.jpg
## 2 brown tree creeper   en    <NA>  birds/n115_w1150_42399797481_o.jpg

Taxonomic name resolution

latin <- results2$text[results2$cld2 == "la"|
                         results2$cld3 == "la"]
taxize::gnr_resolve(latin,
  best_match_only = TRUE)
## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI             0.988

OCR bird naming workflow in action!

bird_names <- purrr::map(filenames, crop_bird) %>%
  purrr::map_df(get_names)
safe_resolve <- function(text){
  
  results <- taxize::gnr_resolve(text,
                                 best_match_only = TRUE)
  
  if(nrow(results) == 0){
    list(NULL)
  }else{
    list(results)
  }
}

bird_names <- dplyr::group_by(bird_names, text) %>%
  dplyr::mutate(gnr = ifelse(cld2 == "la" | cld3 == "la",
                             safe_resolve(text),
                             list(NULL)))
unique(bird_names$gnr)
## [[1]]
## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI             0.988
## 
## [[2]]
## [1] NA
## 
## [[3]]
## NULL
## 
## [[4]]
## # A tibble: 1 x 5
##   user_supplied_na~ submitted_name  matched_name  data_source_title  score
## * <chr>             <chr>           <chr>         <chr>              <dbl>
## 1 austrodicaeum ii~ Austrodicaeum ~ Austrodicaeu~ The Interim Regis~  0.75
## 
## [[5]]
## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 melithreptus laet~ Melithreptus l~ Melithreptus ~ CU*STAR          0.988
## 
## [[6]]
## # A tibble: 1 x 5
##   user_supplied_na~ submitted_name matched_name  data_source_title   score
## * <chr>             <chr>          <chr>         <chr>               <dbl>
## 1 rad isdlvorniode  Rad isdlvorni~ Rad Baker & ~ The Interim Regist~  0.75

Conclusion

rOpenSci packages supporting this (and your) workflow

  • Applicability of this OCR bird naming workflow

More data from the Biodiversity Heritage Library

author <- rbhl::bhl_authorsearch("Gregory M Mathews")
books <- rbhl::bhl_getauthortitles(creatorid = author$CreatorID)
head(books$FullTitle)
## [1] "A manual of the birds of Australia,"                                                                                                                                                                                                                                          
## [2] "A list of the birds of the Phillipian sub-region : which do not occur in Australia. "                                                                                                                                                                                         
## [3] "A manual of the birds of Australia /"                                                                                                                                                                                                                                         
## [4] "A list of the birds of Australia : containing the names and synonyms connected with each genus, species, and subspecies of birds found in Australia, at present known to the author /"                                                                                        
## [5] "Austral avian record; a scientific journal devoted primarily to the study of the Australian avifauna."                                                                                                                                                                        
## [6] "Arcana, or, The museum of natural history : containing the most recent discovered objects : embellished with coloured plates, and corresponding descriptions : with extracts relating to animals, and remarks of celebrated travellers; combining a general survey of nature."
australia_birds <- rbhl::bhl_booksearch(title = "birds Australia")
head(australia_birds$FullTitle)
## [1] "Handbook to the birds of Australia. : [Supplementary material in Charles Darwin's copy]."
## [2] "An introduction to The birds of Australia /"                                             
## [3] "The useful birds of southern Australia : with notes on other birds /"                    
## [4] "The Birds of Australia"                                                                  
## [5] "The birds of Australia,"                                                                 
## [6] "The birds of Australia,"
library("magrittr")

# ocr=TRUE to extract OCR for all pages
rbhl::bhl_getitempages("250938", ocr = TRUE) %>%
  # for each page transform the type into a string
  dplyr::group_by(PageUrl) %>%
  dplyr::mutate(page_type = toString(PageTypes[[1]])) %>%
  # keep only the illustration pages
  # that are like the ones we used 
  dplyr::filter(page_type == "Illustration") %>%
  # from the data.frame extract the OCR
  dplyr::pull(OcrText) %>%
  head()
## [1] "491 \nFAL CUNCULUS LEUCOGASTER. \n( WHITE -BELLIED \243 If BIKE - TIT) \nFALCUNCULUS FRONTATUS. \nSHRIKE - TIT). \n"                                                                                                                   
## [2] "492 \nA** \nOREOICA GUTTURALIS. \n(CRESTED BELL-BIRD). \n"                                                                                                                                                                          
## [3] "APHELOCEPHALA LEUCOPSIS \n( WHITE FACE ). \n"                                                                                                                                                                                       
## [4] "* \nAPHELOCEPHALA PE CTORALIS. \n(CHE <3 TNUT -BREASTED WHITEFA CEj. \nAPHELOCEPHALA NIGRICINCTA. \n(BE A CK-BAH.DED WHITE FA CEj. \n"                                                                                              
## [5] "H . Gronvold. del. \nWitherLy & C\260 \nSPHENOSTOMA CRIS TATUM \n(WEDGE BIEL). \n"                                                                                                                                                     
## [6] "49 6 \nH \n(jronvolcl. del. \nN E O SIT TA LE CJ C O CE PHAI.A. \n( WHITE \246 HE AID EE THE EH UN HE FL). \nNEOSHTA ALBATA \n(F IE E> T Ft E EE UNNEFlj. \nNEOSITTA CHRYSOPTERA \nf OFi. A. NGE - wing-e d tree runner). \nWitWLjA \n"

More birding soon!