I'm relearning Apache Spark after not having used it for quite a while, and I'm attempting to convert this MongoDB document:
{
  "_id": {
    "$oid": "601de7179acebcfb50c8f347"
  },
  "timestamp": {
    "$numberLong": "1612572439411"
  },
  "newsdata": {
    "test1": ["n1", "n2"],
    "test2": ["n3", "n4"]
  }
}
to a Spark DataFrame using:
package sparkanalysis

import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val mongoString = "{\"_id\":{\"$oid\":\"601de7179acebcfb50c8f347\"}," +
      "\"timestamp\":{\"$numberLong\":\"1612572439411\"}," +
      "\"newsdata\":{" +
      "\"test1\" : [\"n1\",\"n2\"]" +
      ",\"test2\" : [\"n3\",\"n4\"]}}"
    print(mongoString)

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .config("spark.master", "local[*]")
      .getOrCreate()

    // This is the call that throws (WordCount.scala:24 in the stack trace below)
    val df = spark.read.json(mongoString)
    println(df)
  }
}
But I receive this exception:
WARNING: All illegal access operations will be denied in a future release
Exception in thread "main" java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: {"_id":%7B%22$oid%22:%22601de7179acebcfb50c8f347%22%7D,%22timestamp%22:%7B%22$numberLong%22:%221612572439411%22%7D,%22newsdata%22:%7B%22test1%22%20:%20%5B%22n1%22,%22n2%22%5D,%22test2%22%20:%20%5B%22n3%22,%22n4%22%5D%7D%7D
at org.apache.hadoop.fs.Path.initialize(Path.java:206)
at org.apache.hadoop.fs.Path.<init>(Path.java:172)
at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:546)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:389)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:241)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:238)
at scala.collection.immutable.List.flatMap(List.scala:352)
at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:391)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:325)
at sparkanalysis.WordCount$.main(WordCount.scala:24)
at sparkanalysis.WordCount.main(WordCount.scala)
Caused by: java.net.URISyntaxException: Relative path in absolute URI: {"_id":%7B%22$oid%22:%22601de7179acebcfb50c8f347%22%7D,%22timestamp%22:%7B%22$numberLong%22:%221612572439411%22%7D,%22newsdata%22:%7B%22test1%22%20:%20%5B%22n1%22,%22n2%22%5D,%22test2%22%20:%20%5B%22n3%22,%22n4%22%5D%7D%7D
at java.base/java.net.URI.checkPath(URI.java:1940)
at java.base/java.net.URI.<init>(URI.java:757)
at org.apache.hadoop.fs.Path.initialize(Path.java:203)
... 15 more
Process finished with exit code 1
I've verified using https://jsonlint.com/ that the JSON is valid. Do I need to specify a case class in order to convert it to a DataFrame correctly?
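For reference, here is a minimal sketch of the alternative I'm considering: wrapping the string in a Dataset[String] and passing that to spark.read.json, on the assumption that the plain String overload treats its argument as a file path rather than as JSON content. The object name WordCountFromString is just illustrative. Is something like this the right direction, or is a case class still required?

package sparkanalysis

import org.apache.spark.sql.SparkSession

object WordCountFromString {
  def main(args: Array[String]): Unit = {
    val mongoString = "{\"_id\":{\"$oid\":\"601de7179acebcfb50c8f347\"}," +
      "\"timestamp\":{\"$numberLong\":\"1612572439411\"}," +
      "\"newsdata\":{\"test1\":[\"n1\",\"n2\"],\"test2\":[\"n3\",\"n4\"]}}"

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .config("spark.master", "local[*]")
      .getOrCreate()

    // Needed for the .toDS() conversion below
    import spark.implicits._

    // Assumption: spark.read.json(Dataset[String]) parses the strings as JSON records,
    // while spark.read.json(path: String) interprets the argument as a file path.
    val jsonDs = Seq(mongoString).toDS()
    val df = spark.read.json(jsonDs)

    df.printSchema()
    df.show(truncate = false)
  }
}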