This is more of an outline how it can be done, please always repect the Terms of use of the website and make sure to use APIs where possible.
You can mimic the post requests the page makes under the hood, every time a page is loaded:

this will get you the company data for 50 companies / page:

In R you can use httr2 to make the post request sending the request body payload

Edit 1
Thanks to @margusl for pointing me towards req_perform_iterative() to perform iterative requests and resps_data() to build a dataframe from the response. We can then send multiple post requests until we reach max_pages to pull the available data from 40 pages.
library(httr2)
get_comp_data_from_all_pages <- \(max_pages = 40) {
request("https://www.dnb.com/business-directory/api/companyinformation") |>
req_method("POST") |>
req_headers(
"Accept" = "application/json, text/plain, */*",
"Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
"Cache-Control" = "no-cache",
"User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Referer" = "https://www.dnb.com/"
) |>
req_body_json(list(
pageNumber = 1,
industryPath = "beverage_manufacturing",
countryIsoTwoCode = "br"
)) |>
req_throttle(rate = 2) |>
req_perform_iterative(
next_req = \(resp, req) {
page_num <- (resp$request$body$data$pageNumber %||% 1) + 1
if (page_num > max_pages) return(NULL)
companies <- resp_body_json(resp)[["companyInformationCompany"]]
if (length(companies) == 0) return(NULL)
req_body_json(req, list(
pageNumber = page_num,
industryPath = "beverage_manufacturing",
countryIsoTwoCode = "br"
))
},
max_reqs = max_pages
) |>
resps_data(\(resp) {
companies <- resp_body_json(resp)[["companyInformationCompany"]]
if (length(companies) == 0) return(NULL)
lapply(companies, rapply, f = c) |> # flatten list for binding
dplyr::bind_rows()
})
}
get_comp_data_from_all_pages()
giving
res
# A tibble: 2,000 × 18
duns primaryName primaryNameForUrl primaryAddress.addre…¹ primaryAddress.addre…² primaryAddress.addre…³ primaryAddress.addre…⁴
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 04210dcfb196… AMBEV S/A ambev_s-a BR Brazil Brazil SAO PAULO
2 a83c292319f8… CERVEJARIA… cervejaria_petro… BR Brazil Brazil RIO DE JANEIRO
3 e55d63d6827b… HNK BR IND… hnk_br_industria… BR Brazil Brazil ITU
4 14250804cf5e… SPAIPA IND… spaipa_industria… BR Brazil Brazil CURITIBA
5 3c6cadd49a54… NORSA REFR… norsa_refrigeran… BR Brazil Brazil FORTALEZA
6 da30ec9a8425… AROSUCO AR… arosuco_aromas_e… BR Brazil Brazil MANAUS
7 c88c496b7ac4… REFRESCOS … refrescos_guarar… BR Brazil Brazil JABOATAO DOS GUARARAP…
8 5d5fe1872a35… CERVEJARIA… cervejaria_petro… BR Brazil Brazil ALAGOINHAS
9 33218fc0ae0c… CERVEJARIA… cervejaria_petro… BR Brazil Brazil ITAPISSUMA
10 50e7284963b0… CERVEJARIA… cervejaria_cidad… BR Brazil Brazil FRUTAL
It seems like the website has duplicated data. You can see it, if you fetch the full data set and then use distinct -> it only exposes 1000 distinct observations. You can see where these duplicates occur if we add the page to each row.
library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows
get_comp_data_from_all_pages <- \(max_pages = 40) {
get_page_data <- \(page_num) {
request_body <- list(
pageNumber = page_num,
industryPath = "beverage_manufacturing",
countryIsoTwoCode = "br"
)
cat("Fetching page", page_num, "...\n")
tryCatch({
res <- request("https://www.dnb.com/business-directory/api/companyinformation") |>
req_method("POST") |>
req_headers(
"Accept" = "application/json, text/plain, */*",
"Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
"Cache-Control" = "no-cache",
"User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Referer" = "https://www.dnb.com/"
) |>
req_body_json(request_body) |>
req_throttle(rate = 2) |>
req_perform() |>
resp_body_json()
companies <- res[["companyInformationCompany"]]
if (length(companies) == 0) {
cat("No more data on page", page_num, "\n")
return(NULL)
}
companies |>
purrr::map(~ {
list(
company_name = .x$primaryName,
sales_revenue = .x$salesRevenue,
country = .x$primaryAddress$addressCountry$countryName,
city = .x$primaryAddress$addressLocality$name,
region = .x$primaryAddress$addressRegion$name,
postal_code = .x$primaryAddress$postalCode,
street = .x$primaryAddress$streetAddress$line1,
page = page_num
)
}) |>
dplyr::bind_rows()
}, error = function(e) {
cat("Error on page", page_num, ":", e$message, "\n")
return(NULL)
})
}
# iterate function over pages
all_data <- map(1:max_pages, get_page_data) |>
compact() |> # Remove NULL results
bind_rows()
}
res <- get_comp_data_from_all_pages()
res2 <- res %>% distinct() # only gives 1000 rows

It makes me question their data quality and if these 14,387 records really exist.
Add 1
If you want to scale this up and retrieve more data from companies in different countries, you can rewrite the function as follows. It now obtains the totalPages from the first request and then uses it as max_pages.
library(httr2)
library(dplyr) # for bind_rows
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
countryIsoTwoCode = "br",
request_rate = 120/60) {
# Common request configuration
make_request <- \(page_num) {
request("https://www.dnb.com/business-directory/api/companyinformation") |>
req_method("POST") |>
req_headers(
"Accept" = "application/json, text/plain, */*",
"User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Referer" = "https://www.dnb.com/"
) |>
req_body_json(list(
pageNumber = page_num,
industryPath = industryPath,
countryIsoTwoCode = countryIsoTwoCode
)) |>
req_throttle(rate = request_rate) |>
req_perform() |>
resp_body_json()
}
# Get total pages
max_pages <- make_request(1)$totalPages
stopifnot(length(max_pages) != 0)
# Fetch all pages
map(1:max_pages, \(page) {
cat("Fetching page", page, "/", max_pages, "\n")
tryCatch({
res <- make_request(page)
companies <- res[["companyInformationCompany"]]
if (length(companies) == 0) return(NULL)
companies |>
lapply(rapply, f = c) |>
bind_rows()
}, error = \(e) {
cat("Error on page", page, ":", e$message, "\n")
NULL
})
}) |>
compact() |>
bind_rows()
}
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
industryPath = "beverage_manufacturing",
countryIsoTwoCode = "br")