2018-08-28-birds-ocr.utf8.md

Free natural history images and appropriate R tooling!

OCR bird naming workflow, piece by piece

Image preparation

library("magrittr")
filenames <- fs::dir_ls("birds")

magick::image_read(filenames[1])

crop_bird <- function(filename){
  image <- magick::image_read(filename)
  
  height <- magick::image_info(image)$height
  
  # crop the top of the image
  image <- magick::image_crop(image, 
                     paste0("+0+",round(0.75*height))) %>%
    # convert the image to black and white
    magick::image_convert(type = "grayscale") %>%
    # increase brightness
    magick::image_modulate(brightness = 120) %>%
    magick::image_enhance() %>%
    magick::image_median() %>%
    magick::image_contrast() 
  
  # we'll need the filename later
  attr(image, "filename") <- filename
  
  return(image)
}

crop_bird(filenames[1])

Text extraction

get_names <- function(image){
  filename <- attr(image, "filename")
  ocr_options <- list(tessedit_pageseg_mode = 1)
   
  text <- magick::image_ocr(image, options = ocr_options)
  text <- stringr::str_split(text, "\n", simplify = TRUE)
  text <- stringr::str_remove_all(text, "[0-9]")
  text <- stringr::str_remove_all(text, "[:punct:]")
  text <- trimws(text)
  text <- stringr::str_remove_all(text, "~")
  text <- text[text != ""]
  text <- tolower(text)
  
  # remove one letter words
  # https://stackoverflow.com/questions/31203843/r-find-and-remove-all-one-to-two-letter-words
  text <- stringr::str_remove_all(text, " *\\b[[:alpha:]]{1,2}\\b *")
  text <- text[text != ""]
  
  # keep only the words that are recognized as either Latin
  # or English by cld2 or cld3
  if(length(text) > 0){
    results <- tibble::tibble(text = text,
                 cld2 = cld2::detect_language(text),
                 cld3 = cld3::detect_language(text),
                 filename = filename)
  
  results[results $cld2 %in% c("la", "en") |
          results$cld3 %in% c("la", "en"),]
  }else{
    return(NULL)
  }
  
  
}

(results1 <- filenames[1] %>%
  magick::image_read() %>%
  get_names())

## NULL

(results2 <- filenames[1] %>%
  crop_bird() %>%
  get_names())

## # A tibble: 2 x 4
##   text                 cld2  cld3  filename                          
##   <chr>                <chr> <chr> <fs::path>                        
## 1 climacteris picumnus <NA>  la    birds/n115_w1150_42399797481_o.jpg
## 2 brown tree creeper   en    <NA>  birds/n115_w1150_42399797481_o.jpg

Taxonomic name resolution

latin <- results2$text[results2$cld2 == "la"|
                         results2$cld3 == "la"]
taxize::gnr_resolve(latin,
  best_match_only = TRUE)

## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI             0.988

OCR bird naming workflow in action!

bird_names <- purrr::map(filenames, crop_bird) %>%
  purrr::map_df(get_names)

safe_resolve <- function(text){
  
  results <- taxize::gnr_resolve(text,
                                 best_match_only = TRUE)
  
  if(nrow(results) == 0){
    list(NULL)
  }else{
    list(results)
  }
}

bird_names <- dplyr::group_by(bird_names, text) %>%
  dplyr::mutate(gnr = ifelse(cld2 == "la" | cld3 == "la",
                             safe_resolve(text),
                             list(NULL)))

unique(bird_names$gnr)

## [[1]]
## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI             0.988
## 
## [[2]]
## [1] NA
## 
## [[3]]
## NULL
## 
## [[4]]
## # A tibble: 1 x 5
##   user_supplied_na~ submitted_name  matched_name  data_source_title  score
## * <chr>             <chr>           <chr>         <chr>              <dbl>
## 1 austrodicaeum ii~ Austrodicaeum ~ Austrodicaeu~ The Interim Regis~  0.75
## 
## [[5]]
## # A tibble: 1 x 5
##   user_supplied_name submitted_name  matched_name   data_source_tit~ score
## * <chr>              <chr>           <chr>          <chr>            <dbl>
## 1 melithreptus laet~ Melithreptus l~ Melithreptus ~ CU*STAR          0.988
## 
## [[6]]
## # A tibble: 1 x 5
##   user_supplied_na~ submitted_name matched_name  data_source_title   score
## * <chr>             <chr>          <chr>         <chr>               <dbl>
## 1 rad isdlvorniode  Rad isdlvorni~ Rad Baker & ~ The Interim Regist~  0.75

Conclusion

rOpenSci packages supporting this (and your) workflow

Applicability of this OCR bird naming workflow

More data from the Biodiversity Heritage Library

author <- rbhl::bhl_authorsearch("Gregory M Mathews")
books <- rbhl::bhl_getauthortitles(creatorid = author$CreatorID)
head(books$FullTitle)

## [1] "A manual of the birds of Australia,"                                                                                                                                                                                                                                          
## [2] "A list of the birds of the Phillipian sub-region : which do not occur in Australia. "                                                                                                                                                                                         
## [3] "A manual of the birds of Australia /"                                                                                                                                                                                                                                         
## [4] "A list of the birds of Australia : containing the names and synonyms connected with each genus, species, and subspecies of birds found in Australia, at present known to the author /"                                                                                        
## [5] "Austral avian record; a scientific journal devoted primarily to the study of the Australian avifauna."                                                                                                                                                                        
## [6] "Arcana, or, The museum of natural history : containing the most recent discovered objects : embellished with coloured plates, and corresponding descriptions : with extracts relating to animals, and remarks of celebrated travellers; combining a general survey of nature."

australia_birds <- rbhl::bhl_booksearch(title = "birds Australia")
head(australia_birds$FullTitle)

## [1] "Handbook to the birds of Australia. : [Supplementary material in Charles Darwin's copy]."
## [2] "An introduction to The birds of Australia /"                                             
## [3] "The useful birds of southern Australia : with notes on other birds /"                    
## [4] "The Birds of Australia"                                                                  
## [5] "The birds of Australia,"                                                                 
## [6] "The birds of Australia,"

library("magrittr")

# ocr=TRUE to extract OCR for all pages
rbhl::bhl_getitempages("250938", ocr = TRUE) %>%
  # for each page transform the type into a string
  dplyr::group_by(PageUrl) %>%
  dplyr::mutate(page_type = toString(PageTypes[[1]])) %>%
  # keep only the illustration pages
  # that are like the ones we used 
  dplyr::filter(page_type == "Illustration") %>%
  # from the data.frame extract the OCR
  dplyr::pull(OcrText) %>%
  head()

## [1] "491 \nFAL CUNCULUS LEUCOGASTER. \n( WHITE -BELLIED \243 If BIKE - TIT) \nFALCUNCULUS FRONTATUS. \nSHRIKE - TIT). \n"                                                                                                                   
## [2] "492 \nA** \nOREOICA GUTTURALIS. \n(CRESTED BELL-BIRD). \n"                                                                                                                                                                          
## [3] "APHELOCEPHALA LEUCOPSIS \n( WHITE FACE ). \n"                                                                                                                                                                                       
## [4] "* \nAPHELOCEPHALA PE CTORALIS. \n(CHE <3 TNUT -BREASTED WHITEFA CEj. \nAPHELOCEPHALA NIGRICINCTA. \n(BE A CK-BAH.DED WHITE FA CEj. \n"                                                                                              
## [5] "H . Gronvold. del. \nWitherLy & C\260 \nSPHENOSTOMA CRIS TATUM \n(WEDGE BIEL). \n"                                                                                                                                                     
## [6] "49 6 \nH \n(jronvolcl. del. \nN E O SIT TA LE CJ C O CE PHAI.A. \n( WHITE \246 HE AID EE THE EH UN HE FL). \nNEOSHTA ALBATA \n(F IE E> T Ft E EE UNNEFlj. \nNEOSITTA CHRYSOPTERA \nf OFi. A. NGE - wing-e d tree runner). \nWitWLjA \n"

Free natural history images and appropriate R tooling!

OCR bird naming workflow, piece by piece

Image preparation

Text extraction

Taxonomic name resolution

OCR bird naming workflow in action!

Conclusion

rOpenSci packages supporting this (and your) workflow

Applicability of this OCR bird naming workflow

More data from the Biodiversity Heritage Library

More birding soon!