Convert JSON file to a CSV file using R

Question

I have the JSON file in a .txt file which I'm trying to load into R but I'm getting the following error:

Error in feed_push_parser(readBin(con, raw(), n), reset = TRUE) : 
  parse error: trailing garbage
      " : "SUCCESS"  }    /* 1 */  {    "_id" : "b736c374-b8ae-4e9
                 (right here) ------^

I'm assuming the error is because of multiple instances of /* (number) */ and I can't manually remove them all as my file has 10k instances of these. Is there a way to remove such instances before loading the data into R?

My JSON file looks like below:

/* 0 */
{
  "_id" : "93ccbdb6-8947",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "93ccbdb6-8947",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 1 */
{
  "_id" : "b736c374-b8ae",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "b736c374-b8ae",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 2 */
{
  "_id" : "3312605f-8304",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXE",
    "queryId" : "3312605f-8304",
    "subRequests" : [{
        "origin" : "LON",
        "destination" : "IAD",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 3 */
{
  "_id" : "6b668cfa-9b79",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1NXA",
    "queryId" : "6b668cfa-9b79",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 1,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 4 */
{
  "_id" : "41c373a1-e4cb",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "41c373a1-e4cb",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 5 */
{
  "_id" : "2c8331c4-21ca",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "2c8331c4-21ca",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 6 */
{
  "_id" : "71a09900-1c13",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "71a09900-1c13",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AF",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "DL",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "LH",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 6941,
  "status" : "SUCCESS"
}

/* 7 */
{
  "_id" : "a036a42a-918b",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1MMM",
    "queryId" : "a036a42a-918b",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 8 */
{
  "_id" : "c547be36-805c",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXB",
    "queryId" : "c547be36-805c",
    "subRequests" : [{
        "origin" : "CHI",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 1072,
  "status" : "SUCCESS"
}

My code is below (Although I haven't gotten much far):

library(jsonlite)
library(RJSONIO)
json_data_raw<-fromJSON("mydata.txt")

json_file <- lapply(json_data_raw, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

output <-- do.call("rbind", json_file)
write.csv(a, file="json.csv",row.names = FALSE)
file.show("json.csv")

I'm trying to get my output into a CSV file like below

How exactly do you want to collapse this nested structure to a rectangular CSV file? What precisely would the output look like? You should include sample data in the question itself to make the problem reproducible. Pictures of data aren't as useful. — MrFlick
– MrFlick, Commented Jun 23, 2016 at 16:15
fromJSON(grep('/\\*.*\\*/', readLines(mydata.txt), invert = TRUE, value = TRUE)) gets rid of the comment lines, but still fails for me. — alistaire
– alistaire, Commented Jun 23, 2016 at 16:38

Stibu · Accepted Answer · 2016-06-28 21:07:57Z

11

There are several issues with your text file. As you already noticed, you need to remove the lines of the form /* 0 */. What results is still not valid json. If you want to have several json objects in a file, you need to store them in an array. The json objects are the parts that are closed in curly brakets, e.g.,

{
  "_id" : "93ccbdb6-8947-4687-8e12-edf4e40d6650",
  ...
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

The structure of an array of objects is as follows:

[
  {
    ...
  },
  {
    ...
  }
]

To get your file in shape, you need to add a comma between the objects and add the square brackets. You could do this as follows:

raw <- readLines("mydata.txt")

# get rid of the "/* 0 */" lines
json <- grep("^/\\* [0-9]* \\*/", raw, value = TRUE, invert = TRUE)

# add missing comma after }
n <- length(json)
json[-n] <- gsub("^}$", "},", json[-n])

# add brakets at the beginning and end
json <- c("[", json, "]")

This can be read by fromJSON(), so I assume it is valid json:

library(jsonlite)
table <- fromJSON(json)

The table is nested, that is, some of the tables cells contain a data frame or a list themselves. For example,

table[1,2]
##   travelDate travelDuration shopperDuration oneWay  userId                              queryId
## 1   20151206              7              30  FALSE ATP1KKP 93ccbdb6-8947-4687-8e12-edf4e40d6650
##               subRequests
##     1 WAS, LON, AA, , 1,2

You could use flatten() from the jsonlite package, to get a table with one level of nesting less

flatten(table)[1:3, c(1, 6, 12)]
##                                    _id uiSearchRequest.travelDate uiSearchRequest.subRequests
## 1 93ccbdb6-8947-4687-8e12-edf4e40d6650                   20151206         WAS, LON, AA, , 1,2
## 2 b736c374-b8ae-4e99-8073-9c54517fecd5                   20151206         WAS, LON, AA, , 1,2
## 3 3312605f-8304-4ab8-96d6-6e1a03cfbd9e                   20151206         LON, IAD, AA, , 1,2

The last column is still a list. There are many ways you could handle this. One possibility, is to create a row per subrequest, where the contents of all the other columns (X_id, downloadCount, etc.) are repeated. (This is almost the form that you give in your question, with the only difference that you left cells empty in the reapeated columns, while I repeat the contents.) This is how it can be done:

table <- flatten(fromJSON(json))
tab_list <- lapply(1:nrow(table),
                  function(i) data.frame(table[i, -12], table[i, 12],
                              stringsAsFactors = FALSE))
library(dplyr)
flat_table <- bind_rows(tab_list)

The second line creates a list of data frames. These are combined into a single data frame using bind_rows() from dpylr. (To be more precise, flat_table will be a tbl_df, but the difference to a data.frame is small.) This can then be written to a csv file in the usual way:

write.csv(flat_table, file = "mydata.csv")

edited Jun 28, 2016 at 21:07

answered Jun 23, 2016 at 19:09

Stibu

16k6 gold badges61 silver badges76 bronze badges

Sign up to request clarification or add additional context in comments.

8 Comments

Lester Pereira Over a year ago

That worked fine @Stibu, thanks but it is part of the solution! I'm having problem exporting the contents of the table to .csv or .xlsx file. Do you know how should I fix this error? Error in .jcall(cell, "V", "setCellValue", value) : method setCellValue with signature ([Ljava/lang/String;)V not found In addition: Warning message: In if (is.na(value)) { : the condition has length > 1 and only the first element will be used

Stibu Over a year ago

The result of flatten(table) can indeed not be written to a csv file with write.csv(), because that function can not cope with the fact that one of the columns of the data frame is a list. I have added a few more lines of code that make sure that you really get a single data frame that can be written to a csv file.

Stibu Over a year ago

I'm not familiar with the error message in you comment. Since it mentions Java, I assume that you are using some package to create Excel files. So, you probably have an issue with how you use that package. Maybe the problem is also the list that is contained in the data frame and the problem will go away, once you use the additional code that I provided. If not, you should ask a new and specific question about your problem when creating Excel file (after you searched SO to check, whether the question has already been asked, of course...)

Lester Pereira Over a year ago

When I run tab_list <- lapply(1:nrow(table), function(i) data.frame(table[i, -12], table[i, 12], stringsAsFactors = FALSE)) I'm getting the following error: Error in data.frame(table[i, -12], table[i, 12], stringsAsFactors = FALSE) : arguments imply differing number of rows: 1, 0

Stibu Over a year ago

Well, apparently table <- fromJSON(json) did not work because you got an error message, that you wouldn't have had with table <- flatten(fromJSON(json)). The code in my answer gives the same result as your edit (apart from the column names and the order of the rows), but is much simpler.

|

showdev · Accepted Answer · 2019-12-05 09:33:15Z

2

It's very simple in Python:

import pandas as pd
data = pd.read_json(path_to_input_file)
data.to_csv(path_to_csv_output_file)

edited Dec 5, 2019 at 9:33

showdev

29.3k37 gold badges61 silver badges82 bronze badges

answered Dec 5, 2019 at 8:57

Pramod Shinde

1171 silver badge2 bronze badges

2 Comments

tospig Over a year ago

The question asks for a solution in R, not Pyton.

Pramod Shinde Over a year ago

Just now I was working on converting JSON to CSV using R. However, I was still struggling with the above suggestion and found some cool solution using Pandas. So thought to share it here. :)

Collectives™ on Stack Overflow

Convert JSON file to a CSV file using R

2 Answers 2

8 Comments

2 Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

8 Comments

2 Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related