0

I have the following JSON from which I need to extract the full_address_bucket__ structures (present in ['hits']['hits']['_source']) into a Spark dataframe with Schema of full_address_bucket__ preserved using pyspark library

{
  "took" : 436,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 1194393,
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUKOL1964690000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
          "establishment_id" : "PUKOL1964690000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
            "address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
            "city" : "kolhapur",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : null,
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
            "pin" : "416109",
            "untagged" : "gat no 329, tardal, hatkanangle",
            "street" : null,
            "district" : "kolhapur",
            "complex" : null,
            "house_unsure" : "329",
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      },
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUPUN0002003000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
          "establishment_id" : "PUPUN0002003000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
            "address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
            "city" : "lonavala",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : null,
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
            "pin" : "410405",
            "untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
            "street" : null,
            "district" : "pune",
            "complex" : null,
            "house_unsure" : "228 228",
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      },
      {
        "_index" : "epf_0001",
        "_type" : "document",
        "_id" : "PUPUN0124020000",
        "_score" : 1.0,
        "_source" : {
          "full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
          "establishment_id" : "PUPUN0124020000",
          "full_address_bucket__" : {
            "care_of" : null,
            "adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
            "address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
            "city" : "pune",
            "building_soundex" : null,
            "locality" : null,
            "complex_soundex" : null,
            "house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
            "building" : null,
            "is_adrs_good" : true,
            "adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
            "pin" : "411013",
            "untagged" : null,
            "street" : null,
            "district" : "pune",
            "complex" : null,
            "house_unsure" : null,
            "state" : "maharashtra",
            "floor" : null,
            "landmark" : null
          }
        }
      }
    ]
  }
}

2 Answers 2

1

If you don't want to use the json as string and pull it from a location use the below code. It will directly create a Dataframe for full_address_bucket__

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("Test").getOrCreate()

df = spark.read.format("json").option("inferSchema", "true").load("test14.json").select("hits.*")\
    .select(F.explode("hits").alias('hits')).select('hits.*').select("_source.*").select("full_address_bucket__.*")
df.show()

Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
|             address|           adrs_long|   adrs_long_soundex|building|building_soundex|care_of|    city|complex|complex_soundex|district|floor|               house|house_unsure|is_adrs_good|landmark|locality|   pin|      state|street|            untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...|    null|            null|   null|kolhapur|   null|           null|kolhapur| null|                null|         329|        true|    null|    null|416109|maharashtra|  null|gat no 329, tarda...|
|TAL-LONAVALA,228,...|tal - lonavala, 2...|TL LNFL KRM RFFRM...|    null|            null|   null|lonavala|   null|           null|    pune| null|                null|     228 228|        true|    null|    null|410405|maharashtra|  null|tal - |, 228, gra...|
|PLOT NO.9 S.NO. 1...|plot no . 9 surve...|PLT N SRF N PLT N...|    null|            null|   null|    pune|   null|           null|    pune| null|plot no . 9| surv...|        null|        true|    null|    null|411013|maharashtra|  null|                null|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+

Sign up to request clarification or add additional context in comments.

2 Comments

Please let me know if you made any changes to the json format, Amit. Because, my compiler is unable to read the json and shows corrupt record. Thanks btw.
Good catch. JSON file should be in a single line.
1
import json
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Test").getOrCreate()

json_data = """{
    "took" : 436,
    "timed_out": false,
    "_shards" : {
        "total" : 3,
        "successful" : 3,
        "skipped" : 0,
        "failed" : 0
    },
    "hits" : {
        "total" : 1194393,
        "max_score" : 1.0,
        "hits" : [
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUKOL1964690000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
                    "establishment_id" : "PUKOL1964690000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
                        "address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
                        "city" : "kolhapur",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : null,
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
                        "pin" : "416109",
                        "untagged" : "gat no 329, tardal, hatkanangle",
                        "street" : null,
                        "district" : "kolhapur",
                        "complex" : null,
                        "house_unsure" : "329",
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            },
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUPUN0002003000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
                    "establishment_id" : "PUPUN0002003000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
                        "address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
                        "city" : "lonavala",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : null,
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
                        "pin" : "410405",
                        "untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
                        "street" : null,
                        "district" : "pune",
                        "complex" : null,
                        "house_unsure" : "228 228",
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            },
            {
                "_index" : "epf_0001",
                "_type" : "document",
                "_id" : "PUPUN0124020000",
                "_score" : 1.0,
                "_source" : {
                    "full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
                    "establishment_id" : "PUPUN0124020000",
                    "full_address_bucket__" : {
                        "care_of" : null,
                        "adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
                        "address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
                        "city" : "pune",
                        "building_soundex" : null,
                        "locality" : null,
                        "complex_soundex" : null,
                        "house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
                        "building" : null,
                        "is_adrs_good" : true,
                        "adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
                        "pin" : "411013",
                        "untagged" : null,
                        "street" : null,
                        "district" : "pune",
                        "complex" : null,
                        "house_unsure" : null,
                        "state" : "maharashtra",
                        "floor" : null,
                        "landmark" : null
                    }
                }
            }
        ]
    }
}"""

json_obj = json.loads(json_data)

json_data1 = json_obj['hits']['hits'][0]['_source']['full_address_bucket__']

json_new = [json.dumps(json_data1)]
df = spark.read.json(spark.sparkContext.parallelize(json_new))
df.show()

Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
|             address|           adrs_long|   adrs_long_soundex|building|building_soundex|care_of|    city|complex|complex_soundex|district|floor|house|house_unsure|is_adrs_good|landmark|locality|   pin|      state|street|            untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...|    null|            null|   null|kolhapur|   null|           null|kolhapur| null| null|         329|        true|    null|    null|416109|maharashtra|  null|gat no 329, tarda...|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+


2 Comments

I will be reading the file from HDFS. For this reason I dont want to load the file into memory as a string. Is there any way in which I can retrieve only the full_address_bucket__ fields into the dataframe without loading them into memory
You can open the file in python and convert that as a json object then use it instead of using it like string.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.