Skip to main content
deleted 50 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch every result page of the D&B business directory for one industry
#' and country and combine the pages into a single data frame.
#'
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`. Pages that fail or list no companies are skipped.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # POST the query for a single page and return the parsed JSON body.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # Turn one parsed response into a data frame (NULL when it lists no
  # companies).
  parse_page <- \(res) {
    companies <- res[["companyInformationCompany"]]
    if (length(companies) == 0) {
      return(NULL)
    }
    companies |>
      lapply(rapply, f = c) |> # flatten each nested record to a named vector
      bind_rows()
  }

  # The first response reports the page count; keep it so that page 1 is not
  # requested a second time below.
  first_page <- make_request(1)
  max_pages <- first_page$totalPages
  stopifnot(length(max_pages) != 0)

  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # iterate over pages 1 and 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- if (page == 1) first_page else make_request(page)
      parse_page(res)
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Run the download for beverage manufacturers with country code "br".
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)
library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch every result page of the D&B business directory for one industry
#' and country and combine the pages into a single data frame.
#'
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`. Pages that fail or list no companies are skipped.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # POST the query for a single page and return the parsed JSON body.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # Turn one parsed response into a data frame (NULL when it lists no
  # companies).
  parse_page <- \(res) {
    companies <- res[["companyInformationCompany"]]
    if (length(companies) == 0) {
      return(NULL)
    }
    companies |>
      lapply(rapply, f = c) |> # flatten each nested record to a named vector
      bind_rows()
  }

  # The first response reports the page count; keep it so that page 1 is not
  # requested a second time below.
  first_page <- make_request(1)
  max_pages <- first_page$totalPages
  stopifnot(length(max_pages) != 0)

  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # iterate over pages 1 and 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- if (page == 1) first_page else make_request(page)
      parse_page(res)
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Run the download for beverage manufacturers with country code "br".
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)
library(httr2)
library(dplyr) # for bind_rows

#' Fetch every result page of the D&B business directory for one industry
#' and country and combine the pages into a single data frame.
#'
#' Only httr2 and dplyr are attached for this snippet, so the iteration uses
#' base `lapply()` / `Filter()` rather than purrr's `map()` / `compact()`.
#'
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`. Pages that fail or list no companies are skipped.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # POST the query for a single page and return the parsed JSON body.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # Turn one parsed response into a data frame (NULL when it lists no
  # companies).
  parse_page <- \(res) {
    companies <- res[["companyInformationCompany"]]
    if (length(companies) == 0) {
      return(NULL)
    }
    companies |>
      lapply(rapply, f = c) |> # flatten each nested record to a named vector
      bind_rows()
  }

  # The first response reports the page count; keep it so that page 1 is not
  # requested a second time below.
  first_page <- make_request(1)
  max_pages <- first_page$totalPages
  stopifnot(length(max_pages) != 0)

  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # iterate over pages 1 and 0.
  pages <- lapply(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- if (page == 1) first_page else make_request(page)
      parse_page(res)
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  })

  # Drop the empty results of skipped pages before stacking the rest.
  pages <- Filter(\(page_rows) length(page_rows) > 0, pages)
  bind_rows(pages)
}

# Run the download for beverage manufacturers with country code "br".
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)
added 1941 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25

Add 1

If you want to scale this up and retrieve more data from companies in different countries, you can rewrite the function as follows. It now obtains the totalPages from the first request and then uses it as max_pages.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch every result page of the D&B business directory for one industry
#' and country and combine the pages into a single data frame.
#'
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`. Pages that fail or list no companies are skipped.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # POST the query for a single page and return the parsed JSON body.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # Turn one parsed response into a data frame (NULL when it lists no
  # companies).
  parse_page <- \(res) {
    companies <- res[["companyInformationCompany"]]
    if (length(companies) == 0) {
      return(NULL)
    }
    companies |>
      lapply(rapply, f = c) |> # flatten each nested record to a named vector
      bind_rows()
  }

  # The first response reports the page count; keep it so that page 1 is not
  # requested a second time below.
  first_page <- make_request(1)
  max_pages <- first_page$totalPages
  stopifnot(length(max_pages) != 0)

  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # iterate over pages 1 and 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- if (page == 1) first_page else make_request(page)
      parse_page(res)
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Run the download for beverage manufacturers with country code "br".
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

Add 1

If you want to scale this up and retrieve more data from companies in different countries, you can rewrite the function as follows. It now obtains the totalPages from the first request and then uses it as max_pages.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch every result page of the D&B business directory for one industry
#' and country and combine the pages into a single data frame.
#'
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`. Pages that fail or list no companies are skipped.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # POST the query for a single page and return the parsed JSON body.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # Turn one parsed response into a data frame (NULL when it lists no
  # companies).
  parse_page <- \(res) {
    companies <- res[["companyInformationCompany"]]
    if (length(companies) == 0) {
      return(NULL)
    }
    companies |>
      lapply(rapply, f = c) |> # flatten each nested record to a named vector
      bind_rows()
  }

  # The first response reports the page count; keep it so that page 1 is not
  # requested a second time below.
  first_page <- make_request(1)
  max_pages <- first_page$totalPages
  stopifnot(length(max_pages) != 0)

  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # iterate over pages 1 and 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- if (page == 1) first_page else make_request(page)
      parse_page(res)
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Run the download for beverage manufacturers with country code "br".
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)
added 2454 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25

It seems the website serves duplicated data. You can see this if you fetch the full data set and then call distinct(): only 1,000 distinct observations remain. To see where these duplicates occur, add the page number to each row.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch a fixed number of result pages from the D&B business directory and
#' combine them into one data frame, tagging every row with its page number.
#'
#' @param max_pages Number of pages to request, starting at page 1.
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`, with columns `company_name`, `sales_revenue`, `country`,
#'   `city`, `region`, `postal_code`, `street` and `page`. Pages that fail or
#'   list no companies are skipped.
get_comp_data_from_all_pages <- \(max_pages = 40,
                                  industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 2) {
  # Request one page and return its companies as a data frame, or NULL when
  # the page is empty or the request fails.
  get_page_data <- \(page_num) {
    request_body <- list(
      pageNumber = page_num,
      industryPath = industryPath,
      countryIsoTwoCode = countryIsoTwoCode
    )
    cat("Fetching page", page_num, "...\n")
    tryCatch({
      res <- request("https://www.dnb.com/business-directory/api/companyinformation") |>
        req_method("POST") |>
        req_headers(
          "Accept" = "application/json, text/plain, */*",
          "Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
          "Cache-Control" = "no-cache",
          "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
          "Referer" = "https://www.dnb.com/"
        ) |>
        req_body_json(request_body) |>
        req_throttle(rate = request_rate) |>
        req_perform() |>
        resp_body_json()

      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) {
        cat("No more data on page", page_num, "\n")
        return(NULL)
      }

      # Keep a fixed set of fields per company and record the source page so
      # that duplicates can be traced back to the page they came from.
      companies |>
        purrr::map(\(company) {
          address <- company$primaryAddress
          list(
            company_name = company$primaryName,
            sales_revenue = company$salesRevenue,
            country = address$addressCountry$countryName,
            city = address$addressLocality$name,
            region = address$addressRegion$name,
            postal_code = address$postalCode,
            street = address$streetAddress$line1,
            page = page_num
          )
        }) |>
        dplyr::bind_rows()
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page_num, ":", conditionMessage(e), "\n")
      NULL
    })
  }

  # seq_len() gives an empty sequence for max_pages = 0, whereas 1:0 would
  # iterate over pages 1 and 0. The pipeline is the last expression, so the
  # result is returned visibly instead of through an unused assignment.
  map(seq_len(max_pages), get_page_data) |>
    compact() |> # Remove NULL results
    bind_rows()
}

# Fetch the pages with the default arguments, then drop exact duplicate rows.
res <- get_comp_data_from_all_pages()

res2 <- distinct(res) # only gives 1000 rows

duplicate data

This makes me question their data quality and whether these 14,387 records really exist.

It seems the website serves duplicated data. You can see this if you fetch the full data set and then call distinct(): only 1,000 distinct observations remain. To see where these duplicates occur, add the page number to each row.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

#' Fetch a fixed number of result pages from the D&B business directory and
#' combine them into one data frame, tagging every row with its page number.
#'
#' @param max_pages Number of pages to request, starting at page 1.
#' @param industryPath Value sent as `industryPath` in the JSON request body.
#' @param countryIsoTwoCode Value sent as `countryIsoTwoCode` in the JSON
#'   request body.
#' @param request_rate Passed on to `req_throttle(rate = )`.
#' @return The rows of all successfully fetched pages, combined with
#'   `bind_rows()`, with columns `company_name`, `sales_revenue`, `country`,
#'   `city`, `region`, `postal_code`, `street` and `page`. Pages that fail or
#'   list no companies are skipped.
get_comp_data_from_all_pages <- \(max_pages = 40,
                                  industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 2) {
  # Request one page and return its companies as a data frame, or NULL when
  # the page is empty or the request fails.
  get_page_data <- \(page_num) {
    request_body <- list(
      pageNumber = page_num,
      industryPath = industryPath,
      countryIsoTwoCode = countryIsoTwoCode
    )
    cat("Fetching page", page_num, "...\n")
    tryCatch({
      res <- request("https://www.dnb.com/business-directory/api/companyinformation") |>
        req_method("POST") |>
        req_headers(
          "Accept" = "application/json, text/plain, */*",
          "Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
          "Cache-Control" = "no-cache",
          "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
          "Referer" = "https://www.dnb.com/"
        ) |>
        req_body_json(request_body) |>
        req_throttle(rate = request_rate) |>
        req_perform() |>
        resp_body_json()

      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) {
        cat("No more data on page", page_num, "\n")
        return(NULL)
      }

      # Keep a fixed set of fields per company and record the source page so
      # that duplicates can be traced back to the page they came from.
      companies |>
        purrr::map(\(company) {
          address <- company$primaryAddress
          list(
            company_name = company$primaryName,
            sales_revenue = company$salesRevenue,
            country = address$addressCountry$countryName,
            city = address$addressLocality$name,
            region = address$addressRegion$name,
            postal_code = address$postalCode,
            street = address$streetAddress$line1,
            page = page_num
          )
        }) |>
        dplyr::bind_rows()
    }, error = \(e) {
      # Report and skip a failing page so one bad page does not abort the run.
      cat("Error on page", page_num, ":", conditionMessage(e), "\n")
      NULL
    })
  }

  # seq_len() gives an empty sequence for max_pages = 0, whereas 1:0 would
  # iterate over pages 1 and 0. The pipeline is the last expression, so the
  # result is returned visibly instead of through an unused assignment.
  map(seq_len(max_pages), get_page_data) |>
    compact() |> # Remove NULL results
    bind_rows()
}

# Fetch the pages with the default arguments, then drop exact duplicate rows.
res <- get_comp_data_from_all_pages()

res2 <- distinct(res) # only gives 1000 rows

duplicate data

This makes me question their data quality and whether these 14,387 records really exist.

deleted 382 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading
added 1 character in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading
added 36 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading
added 1723 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading
deleted 2045 characters in body
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading
Source Link
lailaps
  • 11.3k
  • 1
  • 6
  • 25
Loading