Revisions to Extract tables from website with dynamic content with R

deleted 50 characters in body

Source Link

edited Aug 26 at 18:35

11.3k
1
6
25

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Download every result page for one industry/country combination from the
# D&B business-directory endpoint and stack the companies into one data frame.
#
# Arguments:
#   industryPath      Sent as `industryPath` in the JSON request body.
#   countryIsoTwoCode Sent as `countryIsoTwoCode` in the JSON request body.
#   request_rate      Forwarded to `req_throttle(rate = ...)`; the default
#                     120/60 evaluates to 2.
#
# Returns the row-bound result of all pages that could be fetched. Pages that
# fail or contain no companies are skipped; a failure is reported with `cat()`
# and does not abort the run. Stops with an error if the first response has no
# `totalPages` field.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # Build, send and parse the POST request for a single page.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # The first response tells us how many pages exist.
  max_pages <- make_request(1)$totalPages
  stopifnot(
    "response did not contain `totalPages`" = length(max_pages) != 0
  )

  # seq_len() yields an empty sequence when totalPages is 0, whereas
  # 1:max_pages would count down (1, 0) and request a page 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- make_request(page)
      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) return(NULL)

      # rapply(..., f = c) with its default how = "unlist" flattens each
      # nested company record into a named vector; bind_rows() stacks them.
      companies |>
        lapply(rapply, f = c) |>
        bind_rows()
    }, error = \(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Example run: beverage manufacturers in Brazil.
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Download every result page for one industry/country combination from the
# D&B business-directory endpoint and stack the companies into one data frame.
#
# Arguments:
#   industryPath      Sent as `industryPath` in the JSON request body.
#   countryIsoTwoCode Sent as `countryIsoTwoCode` in the JSON request body.
#   request_rate      Forwarded to `req_throttle(rate = ...)`; the default
#                     120/60 evaluates to 2.
#
# Returns the row-bound result of all pages that could be fetched. Pages that
# fail or contain no companies are skipped; a failure is reported with `cat()`
# and does not abort the run. Stops with an error if the first response has no
# `totalPages` field.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # Build, send and parse the POST request for a single page.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # The first response tells us how many pages exist.
  max_pages <- make_request(1)$totalPages
  stopifnot(
    "response did not contain `totalPages`" = length(max_pages) != 0
  )

  # seq_len() yields an empty sequence when totalPages is 0, whereas
  # 1:max_pages would count down (1, 0) and request a page 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- make_request(page)
      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) return(NULL)

      # rapply(..., f = c) with its default how = "unlist" flattens each
      # nested company record into a named vector; bind_rows() stacks them.
      companies |>
        lapply(rapply, f = c) |>
        bind_rows()
    }, error = \(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Example run: beverage manufacturers in Brazil.
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

library(httr2)
library(dplyr) # for bind_rows

# Download every result page for one industry/country combination from the
# D&B business-directory endpoint and stack the companies into one data frame.
#
# Only httr2 and dplyr are attached ahead of this definition, so iteration and
# filtering use base R (`lapply()`, `Filter()`) rather than purrr's `map()` and
# `compact()`.
#
# Arguments:
#   industryPath      Sent as `industryPath` in the JSON request body.
#   countryIsoTwoCode Sent as `countryIsoTwoCode` in the JSON request body.
#   request_rate      Forwarded to `req_throttle(rate = ...)`; the default
#                     120/60 evaluates to 2.
#
# Returns the row-bound result of all pages that could be fetched. Pages that
# fail or contain no companies are skipped; a failure is reported with `cat()`
# and does not abort the run. Stops with an error if the first response has no
# `totalPages` field.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # Build, send and parse the POST request for a single page.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # The first response tells us how many pages exist.
  max_pages <- make_request(1)$totalPages
  stopifnot(
    "response did not contain `totalPages`" = length(max_pages) != 0
  )

  # Fetch one page; NULL when the page is empty or the request fails.
  fetch_page <- \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- make_request(page)
      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) return(NULL)

      # rapply(..., f = c) with its default how = "unlist" flattens each
      # nested company record into a named vector; bind_rows() stacks them.
      companies |>
        lapply(rapply, f = c) |>
        bind_rows()
    }, error = \(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }

  # seq_len() yields an empty sequence when totalPages is 0, whereas
  # 1:max_pages would count down (1, 0) and request a page 0.
  pages <- lapply(seq_len(max_pages), fetch_page)

  # Drop empty results (the base-R counterpart of purrr::compact()).
  Filter(\(x) length(x) > 0, pages) |>
    bind_rows()
}

# Example run: beverage manufacturers in Brazil.
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

added 1941 characters in body

Source Link

edited Aug 26 at 9:09

lailaps

11.3k
1
6
25

Add 1

If you want to scale this up and retrieve data for companies in other countries, you can rewrite the function as follows. It now reads totalPages from the first response and uses it as max_pages.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Download every result page for one industry/country combination from the
# D&B business-directory endpoint and stack the companies into one data frame.
#
# Arguments:
#   industryPath      Sent as `industryPath` in the JSON request body.
#   countryIsoTwoCode Sent as `countryIsoTwoCode` in the JSON request body.
#   request_rate      Forwarded to `req_throttle(rate = ...)`; the default
#                     120/60 evaluates to 2.
#
# Returns the row-bound result of all pages that could be fetched. Pages that
# fail or contain no companies are skipped; a failure is reported with `cat()`
# and does not abort the run. Stops with an error if the first response has no
# `totalPages` field.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # Build, send and parse the POST request for a single page.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # The first response tells us how many pages exist.
  max_pages <- make_request(1)$totalPages
  stopifnot(
    "response did not contain `totalPages`" = length(max_pages) != 0
  )

  # seq_len() yields an empty sequence when totalPages is 0, whereas
  # 1:max_pages would count down (1, 0) and request a page 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- make_request(page)
      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) return(NULL)

      # rapply(..., f = c) with its default how = "unlist" flattens each
      # nested company record into a named vector; bind_rows() stacks them.
      companies |>
        lapply(rapply, f = c) |>
        bind_rows()
    }, error = \(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Example run: beverage manufacturers in Brazil.
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

Add 1

If you want to scale this up and retrieve data for companies in other countries, you can rewrite the function as follows. It now reads totalPages from the first response and uses it as max_pages.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Download every result page for one industry/country combination from the
# D&B business-directory endpoint and stack the companies into one data frame.
#
# Arguments:
#   industryPath      Sent as `industryPath` in the JSON request body.
#   countryIsoTwoCode Sent as `countryIsoTwoCode` in the JSON request body.
#   request_rate      Forwarded to `req_throttle(rate = ...)`; the default
#                     120/60 evaluates to 2.
#
# Returns the row-bound result of all pages that could be fetched. Pages that
# fail or contain no companies are skipped; a failure is reported with `cat()`
# and does not abort the run. Stops with an error if the first response has no
# `totalPages` field.
get_comp_data_from_all_pages <- \(industryPath = "beverage_manufacturing",
                                  countryIsoTwoCode = "br",
                                  request_rate = 120/60) {

  # Build, send and parse the POST request for a single page.
  make_request <- \(page_num) {
    request("https://www.dnb.com/business-directory/api/companyinformation") |>
      req_method("POST") |>
      req_headers(
        "Accept" = "application/json, text/plain, */*",
        "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer" = "https://www.dnb.com/"
      ) |>
      req_body_json(list(
        pageNumber = page_num,
        industryPath = industryPath,
        countryIsoTwoCode = countryIsoTwoCode
      )) |>
      req_throttle(rate = request_rate) |>
      req_perform() |>
      resp_body_json()
  }

  # The first response tells us how many pages exist.
  max_pages <- make_request(1)$totalPages
  stopifnot(
    "response did not contain `totalPages`" = length(max_pages) != 0
  )

  # seq_len() yields an empty sequence when totalPages is 0, whereas
  # 1:max_pages would count down (1, 0) and request a page 0.
  map(seq_len(max_pages), \(page) {
    cat("Fetching page", page, "/", max_pages, "\n")
    tryCatch({
      res <- make_request(page)
      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) return(NULL)

      # rapply(..., f = c) with its default how = "unlist" flattens each
      # nested company record into a named vector; bind_rows() stacks them.
      companies |>
        lapply(rapply, f = c) |>
        bind_rows()
    }, error = \(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page, ":", conditionMessage(e), "\n")
      NULL
    })
  }) |>
    compact() |>
    bind_rows()
}

# Example run: beverage manufacturers in Brazil.
beverage_manufactories_brazil <- get_comp_data_from_all_pages(
  industryPath = "beverage_manufacturing",
  countryIsoTwoCode = "br"
)

added 2454 characters in body

Source Link

edited Aug 25 at 17:21

lailaps

11.3k
1
6
25

It seems the website serves duplicated data. You can see this if you fetch the full data set and then call distinct(): only 1000 distinct observations remain. Adding the page number to each row shows where these duplicates occur.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Fetch result pages 1..max_pages of Brazilian beverage manufacturers from the
# D&B business-directory endpoint and stack them into one data frame.
#
# Arguments:
#   max_pages  Number of pages to request, starting at page 1 (default 40).
#
# Returns one row per company with the columns company_name, sales_revenue,
# country, city, region, postal_code, street and page (the page the row came
# from). Pages that fail or contain no companies are skipped; both cases are
# reported with `cat()` and do not abort the run.
get_comp_data_from_all_pages <- \(max_pages = 40) {
  # Fetch one page and return its companies as a data frame, or NULL when the
  # page is empty or the request fails.
  get_page_data <- \(page_num) {
    request_body <- list(
      pageNumber = page_num,
      industryPath = "beverage_manufacturing",
      countryIsoTwoCode = "br"
    )
    cat("Fetching page", page_num, "...\n")
    tryCatch({
      res <- request("https://www.dnb.com/business-directory/api/companyinformation") |>
        req_method("POST") |>
        req_headers(
          "Accept" = "application/json, text/plain, */*",
          "Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
          "Cache-Control" = "no-cache",
          "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
          "Referer" = "https://www.dnb.com/"
        ) |>
        req_body_json(request_body) |>
        req_throttle(rate = 2) |>
        req_perform() |>
        resp_body_json()

      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) {
        cat("No more data on page", page_num, "\n")
        return(NULL)
      }

      # Pick the fields of interest from each nested company record and tag
      # the row with its page so duplicates can be traced back.
      companies |>
        purrr::map(\(company) {
          list(
            company_name = company$primaryName,
            sales_revenue = company$salesRevenue,
            country = company$primaryAddress$addressCountry$countryName,
            city = company$primaryAddress$addressLocality$name,
            region = company$primaryAddress$addressRegion$name,
            postal_code = company$primaryAddress$postalCode,
            street = company$primaryAddress$streetAddress$line1,
            page = page_num
          )
        }) |>
        dplyr::bind_rows()
    }, error = function(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page_num, ":", conditionMessage(e), "\n")
      NULL
    })
  }

  # seq_len() gives an empty sequence for max_pages = 0 (1:0 would count down).
  # The pipeline is the function's last expression, so the result is returned
  # visibly instead of being hidden behind an assignment.
  map(seq_len(max_pages), get_page_data) |>
    compact() |>  # Remove NULL results
    bind_rows()
}

# Fetch the default number of pages.
res <- get_comp_data_from_all_pages()

# Keep one copy of each exact duplicate row.
res2 <- distinct(res) # only gives 1000 rows

This makes me question their data quality and whether these 14,387 records really exist.

It seems the website serves duplicated data. You can see this if you fetch the full data set and then call distinct(): only 1000 distinct observations remain. Adding the page number to each row shows where these duplicates occur.

library(httr2)
library(purrr) # for map to unnest response list
library(dplyr) # for bind_rows

# Fetch result pages 1..max_pages of Brazilian beverage manufacturers from the
# D&B business-directory endpoint and stack them into one data frame.
#
# Arguments:
#   max_pages  Number of pages to request, starting at page 1 (default 40).
#
# Returns one row per company with the columns company_name, sales_revenue,
# country, city, region, postal_code, street and page (the page the row came
# from). Pages that fail or contain no companies are skipped; both cases are
# reported with `cat()` and do not abort the run.
get_comp_data_from_all_pages <- \(max_pages = 40) {
  # Fetch one page and return its companies as a data frame, or NULL when the
  # page is empty or the request fails.
  get_page_data <- \(page_num) {
    request_body <- list(
      pageNumber = page_num,
      industryPath = "beverage_manufacturing",
      countryIsoTwoCode = "br"
    )
    cat("Fetching page", page_num, "...\n")
    tryCatch({
      res <- request("https://www.dnb.com/business-directory/api/companyinformation") |>
        req_method("POST") |>
        req_headers(
          "Accept" = "application/json, text/plain, */*",
          "Accept-Language" = "en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7,zh-CN;q=0.6,zh;q=0.5",
          "Cache-Control" = "no-cache",
          "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
          "Referer" = "https://www.dnb.com/"
        ) |>
        req_body_json(request_body) |>
        req_throttle(rate = 2) |>
        req_perform() |>
        resp_body_json()

      companies <- res[["companyInformationCompany"]]
      if (length(companies) == 0) {
        cat("No more data on page", page_num, "\n")
        return(NULL)
      }

      # Pick the fields of interest from each nested company record and tag
      # the row with its page so duplicates can be traced back.
      companies |>
        purrr::map(\(company) {
          list(
            company_name = company$primaryName,
            sales_revenue = company$salesRevenue,
            country = company$primaryAddress$addressCountry$countryName,
            city = company$primaryAddress$addressLocality$name,
            region = company$primaryAddress$addressRegion$name,
            postal_code = company$primaryAddress$postalCode,
            street = company$primaryAddress$streetAddress$line1,
            page = page_num
          )
        }) |>
        dplyr::bind_rows()
    }, error = function(e) {
      # Best effort: report the failing page and carry on with the rest.
      cat("Error on page", page_num, ":", conditionMessage(e), "\n")
      NULL
    })
  }

  # seq_len() gives an empty sequence for max_pages = 0 (1:0 would count down).
  # The pipeline is the function's last expression, so the result is returned
  # visibly instead of being hidden behind an assignment.
  map(seq_len(max_pages), get_page_data) |>
    compact() |>  # Remove NULL results
    bind_rows()
}

# Fetch the default number of pages.
res <- get_comp_data_from_all_pages()

# Keep one copy of each exact duplicate row.
res2 <- distinct(res) # only gives 1000 rows

This makes me question their data quality and whether these 14,387 records really exist.

deleted 382 characters in body

Source Link

edited Aug 25 at 17:11

lailaps

11.3k
1
6
25

Loading

added 1 character in body

Source Link

edited Aug 25 at 16:53

lailaps

11.3k
1
6
25

Loading

added 36 characters in body

Source Link

edited Aug 25 at 10:27

lailaps

11.3k
1
6
25

Loading

added 1723 characters in body

Source Link

edited Aug 25 at 8:54

lailaps

11.3k
1
6
25

Loading

deleted 2045 characters in body

Source Link

edited Aug 25 at 8:48

lailaps

11.3k
1
6
25

Loading

Source Link

answered Aug 25 at 6:34

lailaps

11.3k
1
6
25

Loading

Collectives™ on Stack Overflow

Return to Answer

Add 1

Add 1