library("magrittr")
filenames <- fs::dir_ls("birds")
magick::image_read(filenames[1])
crop_bird <- function(filename){
image <- magick::image_read(filename)
height <- magick::image_info(image)$height
# crop the top of the image
image <- magick::image_crop(image,
paste0("+0+",round(0.75*height))) %>%
# convert the image to black and white
magick::image_convert(type = "grayscale") %>%
# increase brightness
magick::image_modulate(brightness = 120) %>%
magick::image_enhance() %>%
magick::image_median() %>%
magick::image_contrast()
# we'll need the filename later
attr(image, "filename") <- filename
return(image)
}
crop_bird(filenames[1])
get_names <- function(image){
filename <- attr(image, "filename")
ocr_options <- list(tessedit_pageseg_mode = 1)
text <- magick::image_ocr(image, options = ocr_options)
text <- stringr::str_split(text, "\n", simplify = TRUE)
text <- stringr::str_remove_all(text, "[0-9]")
text <- stringr::str_remove_all(text, "[:punct:]")
text <- trimws(text)
text <- stringr::str_remove_all(text, "~")
text <- text[text != ""]
text <- tolower(text)
# remove one letter words
# https://stackoverflow.com/questions/31203843/r-find-and-remove-all-one-to-two-letter-words
text <- stringr::str_remove_all(text, " *\\b[[:alpha:]]{1,2}\\b *")
text <- text[text != ""]
# keep only the words that are recognized as either Latin
# or English by cld2 or cld3
if(length(text) > 0){
results <- tibble::tibble(text = text,
cld2 = cld2::detect_language(text),
cld3 = cld3::detect_language(text),
filename = filename)
results[results $cld2 %in% c("la", "en") |
results$cld3 %in% c("la", "en"),]
}else{
return(NULL)
}
}
(results1 <- filenames[1] %>%
magick::image_read() %>%
get_names())
## NULL
(results2 <- filenames[1] %>%
crop_bird() %>%
get_names())
## # A tibble: 2 x 4
## text cld2 cld3 filename
## <chr> <chr> <chr> <fs::path>
## 1 climacteris picumnus <NA> la birds/n115_w1150_42399797481_o.jpg
## 2 brown tree creeper en <NA> birds/n115_w1150_42399797481_o.jpg
latin <- results2$text[results2$cld2 == "la"|
results2$cld3 == "la"]
taxize::gnr_resolve(latin,
best_match_only = TRUE)
## # A tibble: 1 x 5
## user_supplied_name submitted_name matched_name data_source_tit~ score
## * <chr> <chr> <chr> <chr> <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI 0.988
bird_names <- purrr::map(filenames, crop_bird) %>%
purrr::map_df(get_names)
safe_resolve <- function(text){
results <- taxize::gnr_resolve(text,
best_match_only = TRUE)
if(nrow(results) == 0){
list(NULL)
}else{
list(results)
}
}
bird_names <- dplyr::group_by(bird_names, text) %>%
dplyr::mutate(gnr = ifelse(cld2 == "la" | cld3 == "la",
safe_resolve(text),
list(NULL)))
unique(bird_names$gnr)
## [[1]]
## # A tibble: 1 x 5
## user_supplied_name submitted_name matched_name data_source_tit~ score
## * <chr> <chr> <chr> <chr> <dbl>
## 1 climacteris picum~ Climacteris pi~ Climacteris p~ NCBI 0.988
##
## [[2]]
## [1] NA
##
## [[3]]
## NULL
##
## [[4]]
## # A tibble: 1 x 5
## user_supplied_na~ submitted_name matched_name data_source_title score
## * <chr> <chr> <chr> <chr> <dbl>
## 1 austrodicaeum ii~ Austrodicaeum ~ Austrodicaeu~ The Interim Regis~ 0.75
##
## [[5]]
## # A tibble: 1 x 5
## user_supplied_name submitted_name matched_name data_source_tit~ score
## * <chr> <chr> <chr> <chr> <dbl>
## 1 melithreptus laet~ Melithreptus l~ Melithreptus ~ CU*STAR 0.988
##
## [[6]]
## # A tibble: 1 x 5
## user_supplied_na~ submitted_name matched_name data_source_title score
## * <chr> <chr> <chr> <chr> <dbl>
## 1 rad isdlvorniode Rad isdlvorni~ Rad Baker & ~ The Interim Regist~ 0.75
author <- rbhl::bhl_authorsearch("Gregory M Mathews")
books <- rbhl::bhl_getauthortitles(creatorid = author$CreatorID)
head(books$FullTitle)
## [1] "A manual of the birds of Australia,"
## [2] "A list of the birds of the Phillipian sub-region : which do not occur in Australia. "
## [3] "A manual of the birds of Australia /"
## [4] "A list of the birds of Australia : containing the names and synonyms connected with each genus, species, and subspecies of birds found in Australia, at present known to the author /"
## [5] "Austral avian record; a scientific journal devoted primarily to the study of the Australian avifauna."
## [6] "Arcana, or, The museum of natural history : containing the most recent discovered objects : embellished with coloured plates, and corresponding descriptions : with extracts relating to animals, and remarks of celebrated travellers; combining a general survey of nature."
australia_birds <- rbhl::bhl_booksearch(title = "birds Australia")
head(australia_birds$FullTitle)
## [1] "Handbook to the birds of Australia. : [Supplementary material in Charles Darwin's copy]."
## [2] "An introduction to The birds of Australia /"
## [3] "The useful birds of southern Australia : with notes on other birds /"
## [4] "The Birds of Australia"
## [5] "The birds of Australia,"
## [6] "The birds of Australia,"
library("magrittr")
# ocr=TRUE to extract OCR for all pages
rbhl::bhl_getitempages("250938", ocr = TRUE) %>%
# for each page transform the type into a string
dplyr::group_by(PageUrl) %>%
dplyr::mutate(page_type = toString(PageTypes[[1]])) %>%
# keep only the illustration pages
# that are like the ones we used
dplyr::filter(page_type == "Illustration") %>%
# from the data.frame extract the OCR
dplyr::pull(OcrText) %>%
head()
## [1] "491 \nFAL CUNCULUS LEUCOGASTER. \n( WHITE -BELLIED \243 If BIKE - TIT) \nFALCUNCULUS FRONTATUS. \nSHRIKE - TIT). \n"
## [2] "492 \nA** \nOREOICA GUTTURALIS. \n(CRESTED BELL-BIRD). \n"
## [3] "APHELOCEPHALA LEUCOPSIS \n( WHITE FACE ). \n"
## [4] "* \nAPHELOCEPHALA PE CTORALIS. \n(CHE <3 TNUT -BREASTED WHITEFA CEj. \nAPHELOCEPHALA NIGRICINCTA. \n(BE A CK-BAH.DED WHITE FA CEj. \n"
## [5] "H . Gronvold. del. \nWitherLy & C\260 \nSPHENOSTOMA CRIS TATUM \n(WEDGE BIEL). \n"
## [6] "49 6 \nH \n(jronvolcl. del. \nN E O SIT TA LE CJ C O CE PHAI.A. \n( WHITE \246 HE AID EE THE EH UN HE FL). \nNEOSHTA ALBATA \n(F IE E> T Ft E EE UNNEFlj. \nNEOSITTA CHRYSOPTERA \nf OFi. A. NGE - wing-e d tree runner). \nWitWLjA \n"