import pyspark.sql.functions as F

# Sample input frame with only two columns ('foo', 'bar').
raw_data = spark.createDataFrame(
    [
        ('1', 20),
        ('2', 34),
        ('3', 12),
    ],
    ['foo', 'bar'],
)

# Columns that must be present in the output frame.
extractColumns = ['refsnp_id', 'chr_name', 'chrom_start', 'chrom_end', 'version']

# Backfill: add each column missing from raw_data as a null literal so the
# resulting schema contains every name in extractColumns.
# NOTE: the original paste had a stray trailing backslash after withColumn(...)
# that fused the next statement onto this expression (syntax error), and the
# loop body had lost its indentation — both fixed here.
new_raw_data = raw_data
for col in extractColumns:
    if col not in raw_data.columns:
        new_raw_data = new_raw_data.withColumn(col, F.lit(None))

new_raw_data.show()
+---+---+---------+--------+-----------+---------+-------+
|foo|bar|refsnp_id|chr_name|chrom_start|chrom_end|version|
+---+---+---------+--------+-----------+---------+-------+
| 1| 20| null| null| null| null| null|
| 2| 34| null| null| null| null| null|
| 3| 12| null| null| null| null| null|
+---+---+---------+--------+-----------+---------+-------+
# Alternative one-liner: select the desired columns directly and append the
# missing 'version' column as a null literal.
# NOTE: the original paste fused the import and the expression onto one line
# (syntax error); they are split here, and the discouraged wildcard import is
# narrowed to the single name actually used.
from pyspark.sql.functions import lit

raw_data.select(['refsnp_id', 'chr_name', 'chrom_start', 'chrom_end']).withColumn("version", lit(None))