It seems that the website serves duplicated data. You can see this if you fetch the full data set and then call `distinct()`: only 1,000 distinct observations remain. Adding the page number to each row shows where these duplicates occur.
library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows
# Fetch company listings from the D&B business directory, page by page.
#
# Sends one POST request per page to the directory's `companyinformation`
# endpoint for the hard-coded industry ("beverage_manufacturing") and
# country ("br"), then stacks the per-page results with bind_rows().
#
# Arguments:
#   max_pages  Number of pages to request, starting at page 1. A value of 0
#              sends no request at all.
#
# Returns:
#   The row-bound result of all pages. When at least one company is
#   returned it has one row per company and the columns company_name,
#   sales_revenue, country, city, region, postal_code, street and page
#   (the page number the row was fetched from). Pages that are empty or
#   that raise an error are reported via cat() and contribute no rows.
get_comp_data_from_all_pages <- \(max_pages = 40) {
  stopifnot(
    is.numeric(max_pages),
    length(max_pages) == 1L,
    !is.na(max_pages),
    max_pages >= 0
  )

  # A field that is absent from the response is NULL; map it to NA so that
  # every company produces the same set of columns.
  or_na <- \(value) if (is.null(value)) NA else value

  # Fetch a single page. Returns the page's rows, or NULL when the page is
  # empty or the request / parsing failed.
  get_page_data <- \(page_num) {
    request_body <- list(
      pageNumber = page_num,
      industryPath = "beverage_manufacturing",
      countryIsoTwoCode = "br"
    )
    cat("Fetching page", page_num, "...\n")

    tryCatch(
      {
        res <- request(
          "https://www.dnb.com/business-directory/api/companyinformation"
        ) |>
          req_method("POST") |>
          req_headers(
            "Accept" = "application/json, text/plain, */*",
            "Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
            "Cache-Control" = "no-cache",
            "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Referer" = "https://www.dnb.com/"
          ) |>
          req_body_json(request_body) |>
          req_throttle(rate = 2) |> # rate-limit outgoing requests
          req_perform() |>
          resp_body_json()

        companies <- res[["companyInformationCompany"]]

        if (length(companies) == 0) {
          cat("No more data on page", page_num, "\n")
          NULL
        } else {
          companies |>
            map(\(company) {
              address <- company$primaryAddress
              list(
                company_name = or_na(company$primaryName),
                sales_revenue = or_na(company$salesRevenue),
                country = or_na(address$addressCountry$countryName),
                city = or_na(address$addressLocality$name),
                region = or_na(address$addressRegion$name),
                postal_code = or_na(address$postalCode),
                street = or_na(address$streetAddress$line1),
                page = page_num
              )
            }) |>
            bind_rows()
        }
      },
      error = function(e) {
        # Best effort: report the failure and skip this page.
        cat("Error on page", page_num, ":", conditionMessage(e), "\n")
        NULL
      }
    )
  }

  # seq_len() instead of 1:max_pages, which would count down (1, 0) and
  # request pages 1 and 0 when max_pages is 0.
  map(seq_len(max_pages), get_page_data) |>
    compact() |> # Remove NULL results
    bind_rows()
}
# Download every page, then drop exact duplicate rows.
# NOTE(review): distinct() compares all columns, including `page`, so the
# same company listed on two different pages is not collapsed -- confirm
# that this is the intended duplicate check.
res <- get_comp_data_from_all_pages()
res2 <- distinct(res) # only gives 1000 rows

This makes me question their data quality and whether these 14,387 records really exist.