I have the data in a JSON file (just showing the first row):
{
"cd_created_date": "2021-10-05T21:33:39.480933",
"cd_jurisdiction": "PROBATE",
"cd_last_modified": "2021-10-05T21:35:04.061105",
"cd_last_state_modified_date": "2021-10-05T21:35:04.060968",
"cd_latest_state": "WillWithdrawn",
"cd_reference": 1633469619443286,
"cd_security_classification": "PUBLIC",
"cd_version": 7,
"ce_case_data_id": 3483511,
"ce_case_type_id": "WillLodgement",
"ce_case_type_version": 170,
"ce_created_date": "2021-10-05T21:33:51.189872",
"ce_data": "{\"willDate\": \"1950-01-01\", \"jointWill\": \"Yes\", \"lodgedDate\": \"1970-03-03\", \"codicilDate\": \"1962-02-02\", \"executorTitle\": \"Mr\", \"lodgementType\": \"safeCustody\", \"deceasedGender\": \"male\", \"applicationType\": \"Personal\", \"deceasedAddress\": {\"County\": \"London\", \"Country\": \"United Kingdom\", \"PostCode\": \"SW1A 1AA\", \"PostTown\": \"London\", \"AddressLine1\": \"1\", \"AddressLine2\": \"Buckingham Palace\", \"AddressLine3\": \"The place to be\"}, \"deceasedSurname\": \"E2E_deceased_surname_1633469477956\", \"executorAddress\": {\"County\": \"London\", \"Country\": \"United Kingdom\", \"PostCode\": \"SW1A 1AA\", \"PostTown\": \"London\", \"AddressLine1\": \"1\", \"AddressLine2\": \"Buckingham Palace\", \"AddressLine3\": \"The place to be\"}, \"executorSurname\": \"executor1_surname\", \"numberOfCodicils\": \"3\", \"registryLocation\": \"Liverpool\", \"deceasedForenames\": \"E2E_deceased_forenames_1633469477956\", \"documentsUploaded\": [{\"id\": \"b1181bfb-d0a7-49d8-8301-b06e58eb42c1\", \"value\": {\"Comment\": \"test file to upload\", \"DocumentLink\": {\"document_url\": \"http://dm-store-aat.service.core-compute-aat.internal/documents/60cd0a78-648e-4af7-9f83-15a380f1786d\", \"document_filename\": \"test_file_for_document_upload.png\", \"document_binary_url\": \"http://dm-store-aat.service.core-compute-aat.internal/documents/60cd0a78-648e-4af7-9f83-15a380f1786d/binary\"}, \"DocumentType\": \"email\"}}], \"executorForenames\": \"executor1_forenames\", \"deceasedDateOfBirth\": \"1930-01-01\", \"deceasedDateOfDeath\": \"2017-01-01\", \"deceasedTypeOfDeath\": \"diedOnOrAbout\", \"deceasedEmailAddress\": \"[email protected]\", \"executorEmailAddress\": \"[email protected]\", \"deceasedAnyOtherNames\": \"Yes\", \"additionalExecutorList\": [{\"id\": \"bd8d7ca2-ed84-424c-a241-40fd99b15596\", \"value\": {\"executorTitle\": \"Dr\", \"executorAddress\": {\"County\": \"London\", \"Country\": \"United Kingdom\", \"PostCode\": \"SW1A 1AA\", \"PostTown\": \"London\", \"AddressLine1\": \"1\", \"AddressLine2\": \"Buckingham Palace\", \"AddressLine3\": \"The place to be\"}, \"executorSurname\": \"executor2_surname\", \"executorForenames\": \"executor2_forenames\", \"executorEmailAddress\": \"[email protected]\"}}], \"deceasedFullAliasNameList\": [{\"id\": \"1970ac9d-532e-48f2-8851-04ae6eec973f\", \"value\": {\"FullAliasName\": \"deceased_alias1_1633469477956\"}}, {\"id\": \"86c842aa-e10f-44d5-8c28-a7ce8a1cb0eb\", \"value\": {\"FullAliasName\": \"deceased_alias2\"}}]}",
"ce_description": "upload_document_event_description_text",
"ce_event_id": "uploadDocument",
"ce_event_name": "Upload document",
"ce_id": 30638630,
"ce_security_classification": "PUBLIC",
"ce_state_id": "WillLodgementCreated",
"ce_state_name": "Will lodgement created",
"ce_summary": "upload_document_event_summary_text",
"ce_user_first_name": "Probate",
"ce_user_id": "349978",
"ce_user_last_name": "Backoffice",
"extraction_date": "2021-10-06"
}
As you can see, the ce_data field is a JSON string containing nested objects and arrays.
When I read the JSON into Apache Spark on Databricks, printSchema() gives the following (a snippet of roughly how I'm loading the file is shown after the schema):
root
|-- cd_created_date: string (nullable = true)
|-- cd_jurisdiction: string (nullable = true)
|-- cd_last_modified: string (nullable = true)
|-- cd_last_state_modified_date: string (nullable = true)
|-- cd_latest_state: string (nullable = true)
|-- cd_reference: long (nullable = true)
|-- cd_security_classification: string (nullable = true)
|-- cd_version: long (nullable = true)
|-- ce_case_data_id: long (nullable = true)
|-- ce_case_type_id: string (nullable = true)
|-- ce_case_type_version: long (nullable = true)
|-- ce_created_date: string (nullable = true)
**|-- ce_data: string (nullable = true)**
|-- ce_description: string (nullable = true)
|-- ce_event_id: string (nullable = true)
|-- ce_event_name: string (nullable = true)
|-- ce_id: long (nullable = true)
|-- ce_security_classification: string (nullable = true)
|-- ce_state_id: string (nullable = true)
|-- ce_state_name: string (nullable = true)
|-- ce_summary: string (nullable = true)
|-- ce_user_first_name: string (nullable = true)
|-- ce_user_id: string (nullable = true)
|-- ce_user_last_name: string (nullable = true)
|-- extraction_date: string (nullable = true)
As you can see from the printSchema() output above, ce_data comes through as a plain string rather than a struct or array.
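For reference, I'm reading the file and registering it as a temp view along these lines (the path below is just a placeholder):

```python
# Read the exported JSON file (placeholder path) and register it for Spark SQL.
# multiLine may or may not be needed, depending on whether the file is
# pretty-printed or one JSON object per line.
df = spark.read.option("multiLine", True).json("/mnt/raw/ccd_extract.json")
df.createOrReplaceTempView("testtable")
df.printSchema()
```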
However, I would like to query the nested fields and arrays inside ce_data. For example, I would like to write a query that finds rows where lodgedDate = '1970-03-03'.
My attempt was something like:
test = spark.sql("""select ce_data from testtable where ce_data.lodgedDate = '1970-03-03'""")
The error I'm getting when I enter the above code in Databricks is:
Can't extract value from ce_data#12747: need struct type but got string;
So, first I need to understand why the nested structure doesn't appear in printSchema(), but my main question is how to query JSON arrays like this using Spark SQL.
I'm also wondering whether I need to import any additional libraries?
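For what it's worth, the direction I've been looking at is from_json with an explicit schema (or get_json_object for a single field), but I'm not sure this is the right or best approach on Databricks. A rough, untested sketch of what I mean, with the schema covering only the ce_data fields I care about (names taken from the sample row above):

```python
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Partial schema for ce_data: only the fields I want to query (guessed from the sample row)
ce_data_schema = StructType([
    StructField("lodgedDate", StringType()),
    StructField("deceasedSurname", StringType()),
    StructField("deceasedFullAliasNameList", ArrayType(
        StructType([
            StructField("id", StringType()),
            StructField("value", StructType([
                StructField("FullAliasName", StringType())
            ]))
        ])
    ))
])

# Parse the JSON string column into a struct and expose it to Spark SQL
parsed = spark.table("testtable").withColumn(
    "ce_data_parsed", F.from_json("ce_data", ce_data_schema)
)
parsed.createOrReplaceTempView("testtable_parsed")

test = spark.sql("""
    select ce_data_parsed.*
    from testtable_parsed
    where ce_data_parsed.lodgedDate = '1970-03-03'
""")

# Or, pulling a single field without defining a schema at all:
one_field = spark.sql("""
    select get_json_object(ce_data, '$.lodgedDate') as lodgedDate
    from testtable
    where get_json_object(ce_data, '$.lodgedDate') = '1970-03-03'
""")
```

Is something like this the recommended way, and are pyspark.sql.functions and pyspark.sql.types the only imports I would need?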