1

I have the following json loaded as a Dataframe:

root
 |-- transactions_analytics_data: struct (nullable = true)
 |    |-- cumulative_fraud_transactions: long (nullable = true)
 |    |-- cumulative_fraud_transactions_percent: double (nullable = true)
 |    |-- cumulative_transactions: long (nullable = true)
 |    |-- cumulative_transactions_percent: double (nullable = true)
 |    |-- false_predictions: struct (nullable = true)
 |    |    |-- no_of_false_negatives: long (nullable = true)
 |    |    |-- no_of_false_positives: long (nullable = true)
 |    |-- false_predictions_amount: struct (nullable = true)
 |    |    |-- predicted_false_negative: double (nullable = true)
 |    |    |-- predicted_false_positive: double (nullable = true)
 |    |-- fraud_transactions_barline: struct (nullable = true)
 |    |    |-- data: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |    |-- time: string (nullable = true)
 |    |    |    |    |-- totalFrauds: long (nullable = true)
 |    |-- fraud_transactions_map: struct (nullable = true)
 |    |    |-- California: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Connecticut: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Delaware: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Florida: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Georgia: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Hawaii: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Idaho: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Illinois: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Indiana: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Iowa: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Kansas: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Kentucky: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Louisiana: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Maine: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Maryland: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Massachusetts: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Michigan: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Minnesota: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Mississippi: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Missouri: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Montana: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Nebraska: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Nevada: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- New Hampshire: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- New Jersey: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- New Mexico: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- New York: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- North Carolina: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- North Dakota: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Ohio: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Oklahoma: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Oregon: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Pennsylvania: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Rhode Island: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- South Carolina: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- South Dakota: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Tennessee: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Texas: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Utah: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Vermont: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Virginia: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Washington: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |    |-- Wisconsin: struct (nullable = true)
 |    |    |    |-- falseNegatives: long (nullable = true)
 |    |    |    |-- falsePositives: long (nullable = true)
 |    |    |    |-- totalFrauds: long (nullable = true)
 |    |-- top10_affected_merchants: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- count: long (nullable = true)
 |    |    |    |-- date_affected: string (nullable = true)
 |    |    |    |-- merchange_category: string (nullable = true)

I want a DataFrame in which each attribute is its own column. I tried the code below for top10_affected_merchants, and it works because that field is an array, but the same approach does not work on the struct fields. Please help.

// Build a HiveContext on top of the existing SparkContext `sc` and import its implicits.
val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
import sqlContext.implicits._


// Load the sample JSON file into a DataFrame and print the inferred schema.
val jsonDF = sqlContext.jsonFile("file:///root/sample.json")
jsonDF.printSchema()

/// TOP 10 MERCHANTS DATA SET

// Explode the top10_affected_merchants array into one row per element.
// No alias is given here; the SQL below refers to the exploded struct column as `col`.
val top10_affected_merchantsDF = jsonDF.select(explode(jsonDF("transactions_analytics_data.top10_affected_merchants")))
top10_affected_merchantsDF.printSchema()
// Register the exploded rows as a temp table so they can be queried with SQL.
top10_affected_merchantsDF.registerTempTable("top10affectedmerchants")
// Pull each field of the exploded struct out into its own top-level column.
val topaffectedmerchantsDF = sqlContext.sql("SELECT col.count, col.date_affected, col.merchange_category FROM top10affectedmerchants")
topaffectedmerchantsDF.printSchema()
topaffectedmerchantsDF.show()

1 Answer 1

1

This should resolve your issue.

// Load the JSON document into a DataFrame.
val data = sqlContext.jsonFile("path/to/file")

// Step 1: plain struct fields need no explode -- address them by dotted path.
// The two array columns (barline data and top10) are aliased so that they can
// be exploded one at a time in the following steps.
val df = data.select(
                 $"transactions_analytics_data.fraud_transactions_barline.data".alias("trsns_anlytcs_data_barline_data"),
                 $"transactions_analytics_data.fraud_transactions_map.California.falseNegatives".alias("cali_falseNegatives"),
                 $"transactions_analytics_data.false_predictions.no_of_false_negatives",
                 $"transactions_analytics_data.false_predictions.no_of_false_positives",
                 $"transactions_analytics_data.top10_affected_merchants".alias("top10"),
                 $"transactions_analytics_data.cumulative_fraud_transactions")

// Step 2: explode the top10 array (one row per merchant), then lift the
// fields of the exploded struct into uniquely named top-level columns.
val df1 = df.select($"trsns_anlytcs_data_barline_data",
                    $"cali_falseNegatives",
                    $"no_of_false_negatives",$"no_of_false_positives",
                    $"cumulative_fraud_transactions",explode($"top10").alias("top10"))
            .select($"trsns_anlytcs_data_barline_data",$"top10.count".alias("top10_count"),
                    $"top10.date_affected".alias("top10_date_affected"),
                    $"cali_falseNegatives",$"no_of_false_negatives",
                    $"no_of_false_positives",$"cumulative_fraud_transactions")

// Step 3: explode the barline array (one row per barline entry) and flatten it.
// Only columns that df1 actually produces are referenced here; the previous
// version selected "trsns_anlytcs_data_data", which is never created by any
// earlier step and makes the query fail during analysis.
val df2 = df1.select(explode($"trsns_anlytcs_data_barline_data").alias("barline_data"),
                     $"top10_count",$"top10_date_affected",
                     $"cali_falseNegatives",$"no_of_false_negatives",
                     $"no_of_false_positives",$"cumulative_fraud_transactions")
             .select("barline_data.falseNegatives","barline_data.totalFrauds",
                     "top10_count","top10_date_affected",
                     "cali_falseNegatives","no_of_false_negatives",
                     "no_of_false_positives","cumulative_fraud_transactions")

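So if you have nested fields that are plain structs (with no array involved), you cannot explode them; explode only applies to arrays. Struct fields are instead selected directly by their dotted path.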

root
|-- col: struct (nullable = true)
|    |-- col1: long (nullable = true)
|    |-- col2: struct (nullable = true)
|    |    |-- col_1_1: long
|    |-- col3: struct (nullable = true)
|    |    |-- col3_1: array (nullable = true)
|    |    |    |-- col3_2: struct (containsNull = true)
|    |    |    |    |-- col3_3: long (nullable = true)
|    |    |    |    |-- col3_4: string (nullable = true)

From the schema above, to read col_1_1 (a nested struct field) you just write its full dotted path:

"col.col2.col_1_1"

If trying to read col3_3 or col3_4, your select statement will look like this

val DF = data.select($"col.col3.col3_1") // the selected column is named after the leaf field, col3_1
             .select(explode($"col3_1").alias("col3_1")) // col3_1 is an array of structs: explode it into one row per element
             .select($"col3_1.col3_3", $"col3_1.col3_4") // after the explode, the struct's fields are reachable by dotted path

Just be mindful that, since you have many similarly named fields within the nested attributes, you should alias the selected columns so that every column name in the result is unique.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.