Revisions to Extracting html table and turn into tibble or data.frame in R

deleted 14 characters in body

Source Link

edited Sep 6 at 8:14

11.3k
1
6
25

library(rvest)

# Get the <li> rows of the table so each one can be queried with a relative
# XPath. html_elements() is used instead of the superseded html_nodes() so the
# pipeline consistently uses the current rvest API (it already relies on
# html_element()).
rows <- read_html(
  "https://gainblers.com/mx/quinielas/progol-revancha/",
  encoding = "UTF-8"
) |>
  html_element(xpath = '//*[@id="content_seccionb"]/div[1]/ul') |>
  html_elements("li")

# Helper: look up the child of `x` matched by the relative XPath `path` and
# return its text with surrounding whitespace trimmed.
from_xpath <- function(x, path) {
  node <- html_element(x, xpath = path)
  html_text(node, trim = TRUE)
}
 
# @margusl correctly pointed out that from_xpath() is already vectorized and
# can be applied directly to "rows", so no per-row iteration is needed.

# Each column holds the text of one child element, looked up in every <li> of
# `rows` via its relative XPath.
foo <- data.frame(
  nr =        from_xpath(rows, "div[1]/span"),
  partidos1 = from_xpath(rows, "div[1]/p/span[1]"),
  partidos2 = from_xpath(rows, "div[1]/p/span[3]"),
  L1 =        from_xpath(rows, "div[2]/span"),
  L2 =        from_xpath(rows, "div[2]/strong"),
  E1 =        from_xpath(rows, "div[3]/span"),
  E2 =        from_xpath(rows, "div[3]/strong"),
  V1 =        from_xpath(rows, "div[4]/span"),
  V2 =        from_xpath(rows, "div[4]/strong"),
  pron1 =     from_xpath(rows, "div[5]/div[1]"),
  pron2 =     from_xpath(rows, "div[5]/div[2]")
) |>
  subset(!is.na(partidos1)) # filter out header row

<li class="tr quiniela-tr">
    <div class="td flex7 td-event-with-calendar">
        <span class="m-none">1&nbsp;&nbsp;&nbsp; -- from_xpath(row, "div[1]/span")
        </span>
        <p class="event">
            <a class="linkForzoso" href="/es/apuestas/futbol/internacional/amistosos/mexico-japon/">
            <span> 
            México -- from_xpath(row, "div[1]/p/span[1]")
            </span>
            <span class="vs">
            vs -- I skipped this one because it's just "vs"
            </span>
            <span>
            Japón -- from_xpath(row, "div[1]/p/span[3]")
            </span>
            </a>
        </p>
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,40</span> -- from_xpath(row, "div[2]/span")
      <strong class="counter">39%</strong></div> -- from_xpath(row, "div[2]/strong")
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">3,50</span> -- from_xpath(row, "div[3]/span")
      <strong class="counter">27%</strong> -- from_xpath(row, "div[3]/strong")
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,80</span>  -- from_xpath(row, "div[4]/span")
      <strong class="counter">34%</strong> -- from_xpath(row, "div[4]/strong")
    </div>
    <div class="td flex2 f-row">
        <div class="grupo-casilla">L</div> -- from_xpath(row, "div[5]/div[1]")
        <div class="grupo-casilla">V</div> -- from_xpath(row, "div[5]/div[2]")
    </div>
</li>

And then I do this for all elements in rows and map_df to a data.frame.

library(rvest)

# Get the <li> rows of the table to iterate over them.
# html_elements() is used instead of the superseded html_nodes() so the
# pipeline consistently uses the current rvest API (it already relies on
# html_element()).
rows <- read_html(
  "https://gainblers.com/mx/quinielas/progol-revancha/",
  encoding = "UTF-8"
) |>
  html_element(xpath = '//*[@id="content_seccionb"]/div[1]/ul') |>
  html_elements("li")

# Helper: look up the child of `x` matched by the relative XPath `path` and
# return its text with surrounding whitespace trimmed.
from_xpath <- function(x, path) {
  node <- html_element(x, xpath = path)
  html_text(node, trim = TRUE)
}
 
# Build one record per <li>: every named XPath below is resolved against the
# current row, map_df() row-binds the records, and the result is converted to
# a plain data.frame.
foo <- rows |>
  purrr::map_df(\(li) {
    xpaths <- c(
      nr        = "div[1]/span",
      partidos1 = "div[1]/p/span[1]",
      partidos2 = "div[1]/p/span[3]",
      L1        = "div[2]/span",
      L2        = "div[2]/strong",
      E1        = "div[3]/span",
      E2        = "div[3]/strong",
      V1        = "div[4]/span",
      V2        = "div[4]/strong",
      pron1     = "div[5]/div[1]",
      pron2     = "div[5]/div[2]"
    )
    # lapply() keeps the names, so the result is the same named list of
    # single text values.
    lapply(xpaths, \(path) from_xpath(li, path))
  }) |>
  data.frame() |>
  subset(!is.na(partidos1)) # filter out header row

<li class="tr quiniela-tr">
    <div class="td flex7 td-event-with-calendar">
        <span class="m-none">1&nbsp;&nbsp;&nbsp; -- from_xpath(.x, "div[1]/span")
        </span>
        <p class="event">
            <a class="linkForzoso" href="/es/apuestas/futbol/internacional/amistosos/mexico-japon/">
            <span> 
            México -- from_xpath(.x, "div[1]/p/span[1]")
            </span>
            <span class="vs">
            vs -- I skipped this one because it's just "vs"
            </span>
            <span>
            Japón -- from_xpath(.x, "div[1]/p/span[3]")
            </span>
            </a>
        </p>
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,40</span> -- from_xpath(.x, "div[2]/span")
      <strong class="counter">39%</strong></div> -- from_xpath(.x, "div[2]/strong")
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">3,50</span> -- from_xpath(.x, "div[3]/span")
      <strong class="counter">27%</strong> -- from_xpath(.x, "div[3]/strong")
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,80</span>  -- from_xpath(.x, "div[4]/span")
      <strong class="counter">34%</strong> -- from_xpath(.x, "div[4]/strong")
    </div>
    <div class="td flex2 f-row">
        <div class="grupo-casilla">L</div> -- from_xpath(.x, "div[5]/div[1]")
        <div class="grupo-casilla">V</div> -- from_xpath(.x, "div[5]/div[2]")
    </div>
</li>

And then I do this for all elements in rows and map_df to a data.frame.

library(rvest)

# Get the <li> rows of the table so each one can be queried with a relative
# XPath. html_elements() is used instead of the superseded html_nodes() so the
# pipeline consistently uses the current rvest API (it already relies on
# html_element()).
rows <- read_html(
  "https://gainblers.com/mx/quinielas/progol-revancha/",
  encoding = "UTF-8"
) |>
  html_element(xpath = '//*[@id="content_seccionb"]/div[1]/ul') |>
  html_elements("li")

# Helper: look up the child of `x` matched by the relative XPath `path` and
# return its text with surrounding whitespace trimmed.
from_xpath <- function(x, path) {
  node <- html_element(x, xpath = path)
  html_text(node, trim = TRUE)
}
# @margusl correctly pointed out that "from_xpath()" is already vectorized and
# can be applied directly to "rows".

# Map each output column name to the relative XPath of its cell, resolve every
# XPath against all rows at once, and assemble the columns into a data.frame.
foo <- c(
  nr        = "div[1]/span",
  partidos1 = "div[1]/p/span[1]",
  partidos2 = "div[1]/p/span[3]",
  L1        = "div[2]/span",
  L2        = "div[2]/strong",
  E1        = "div[3]/span",
  E2        = "div[3]/strong",
  V1        = "div[4]/span",
  V2        = "div[4]/strong",
  pron1     = "div[5]/div[1]",
  pron2     = "div[5]/div[2]"
) |>
  lapply(\(path) from_xpath(rows, path)) |>
  data.frame() |>
  subset(!is.na(partidos1)) # filter out header row

<li class="tr quiniela-tr">
    <div class="td flex7 td-event-with-calendar">
        <span class="m-none">1&nbsp;&nbsp;&nbsp; -- from_xpath(row, "div[1]/span")
        </span>
        <p class="event">
            <a class="linkForzoso" href="/es/apuestas/futbol/internacional/amistosos/mexico-japon/">
            <span> 
            México -- from_xpath(row, "div[1]/p/span[1]")
            </span>
            <span class="vs">
            vs -- I skipped this one because it's just "vs"
            </span>
            <span>
            Japón -- from_xpath(row, "div[1]/p/span[3]")
            </span>
            </a>
        </p>
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,40</span> -- from_xpath(row, "div[2]/span")
      <strong class="counter">39%</strong></div> -- from_xpath(row, "div[2]/strong")
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">3,50</span> -- from_xpath(row, "div[3]/span")
      <strong class="counter">27%</strong> -- from_xpath(row, "div[3]/strong")
    </div>
    <div class="td flex2 f-row">
      <span class="cuotita in-event no-link">2,80</span>  -- from_xpath(row, "div[4]/span")
      <strong class="counter">34%</strong> -- from_xpath(row, "div[4]/strong")
    </div>
    <div class="td flex2 f-row">
        <div class="grupo-casilla">L</div> -- from_xpath(row, "div[5]/div[1]")
        <div class="grupo-casilla">V</div> -- from_xpath(row, "div[5]/div[2]")
    </div>
</li>

deleted 128 characters in body

Source Link

edited Sep 6 at 7:54

lailaps

11.3k
1
6
25

with this you save yourself the trouble of cleaning up merged texts like "2,4239%". For example, "2,4239%" is really two separate column values, "2,42" and "39%"; that information is lost in gb and you can't easily separate them back.

Answering your comment-question

"//*[@id="content_seccionb"]/div[1]/ul/li[1]/div[1]" matches the whole div, so the text will be concatenated together like "MéxicovsJapón". "div[1]/p/span[1]" on the other hand matches "México", see the HTML structure of one <li[2]> element below.

I added the calls to from_xpath(.x, "") to make it clear which xpath corresponds to DOM-element.

with this you save yourself the trouble of cleaning up the merged texts. For example, "2,4239%" is really two separate column values, "2,42" and "39%"; that information is lost in gb and you can't easily separate them back.

Answering your question

"//*[@id="content_seccionb"]/div[1]/ul/li[1]/div[1]" matches the whole div, so the text will be concatenated together like "MéxicovsJapón". "div[1]/p/span[1]" on the other hand matches "México", see the HTML structure of one <li[2]> element below.

I added the calls to from_xpath(.x, "") to make it clear which XPath corresponds to which DOM element.

with this you save yourself the trouble of cleaning up the merged texts like "2,4239%".

Answering your comment-question

"//*[@id="content_seccionb"]/div[1]/ul/li[1]/div[1]" matches the whole div, so the text will be concatenated together like "MéxicovsJapón". "div[1]/p/span[1]" on the other hand matches "México", see the HTML structure of one <li[2]> element below. I added the calls to from_xpath(.x, "") to make it clear which xpath corresponds to DOM-element.

deleted 19 characters in body

Source Link

edited Sep 6 at 6:52

lailaps

11.3k
1
6
25

with this you save yourself the trouble of cleaning up the merged texts. For example, "2,4239%" is really two separate column values, "2,42" and "39%"; that information is lost in gb and you can't easily separate them back. Then this question becomes "Can I parse HTML with Regex?", which just does not make sense here.

with this you save yourself the trouble of cleaning up the merged texts. For example, "2,4239%" is really two separate column values, "2,42" and "39%"; that information is lost in gb and you can't easily separate them back.

deleted 19 characters in body

Source Link

edited Sep 6 at 6:46

lailaps

11.3k
1
6
25

Loading

added 10 characters in body

Source Link

edited Sep 5 at 22:15

lailaps

11.3k
1
6
25

Loading

deleted 736 characters in body

Source Link

edited Sep 5 at 22:08

lailaps

11.3k
1
6
25

Loading

added 1309 characters in body

Source Link

edited Sep 5 at 22:01

lailaps

11.3k
1
6
25

Loading

added 1309 characters in body

Source Link

edited Sep 5 at 21:55

lailaps

11.3k
1
6
25

Loading

added 1309 characters in body

Source Link

edited Sep 5 at 21:40

lailaps

11.3k
1
6
25

Loading

added 94 characters in body

Source Link

edited Sep 5 at 20:50

lailaps

11.3k
1
6
25

Loading

Post Undeleted by lailaps

occurred Sep 5 at 20:20

Post Deleted by lailaps

occurred Sep 5 at 20:16

added 123 characters in body

Source Link

edited Sep 5 at 20:09

lailaps

11.3k
1
6
25

Loading

deleted 16 characters in body

Source Link

edited Sep 5 at 19:53

lailaps

11.3k
1
6
25

Loading

deleted 16 characters in body

Source Link

edited Sep 5 at 19:47

lailaps

11.3k
1
6
25

Loading

Post Undeleted by lailaps

occurred Sep 5 at 19:37

Post Deleted by lailaps

occurred Sep 5 at 19:37

added 260 characters in body

Source Link

edited Sep 5 at 19:34

lailaps

11.3k
1
6
25

Loading

Source Link

answered Sep 5 at 19:28

lailaps

11.3k
1
6
25

Loading

Collectives™ on Stack Overflow

Return to Answer

Answering your comment-question

Answering your question

Answering your comment-question