I have some trouble printing out this query with the months sorted in proper order.
Is there a pyspark function command to format the month column in descending order? (without using sql commands)
from pyspark import SparkContext
from pyspark.sql import SQLContext
from operator import add
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
rows = sc.textFile("data.csv")
data = rows.map(lambda line: line.split(","))
header = data.first()
q = data.filter(lambda line: line != header)\
.map(lambda x:(x[1], 1))\
.reduceByKey(add)\
.sortByKey(True)\
.collect()
sqlContext.createDataFrame(q, ['Month','Total number of operated flights']).show()
+-----+--------------------------------+
|Month|Total number of operated flights|
+-----+--------------------------------+
| 1| 621559|
| 10| 629992|
| 11| 605149|
| 12| 614139|
| 2| 565604|
| 3| 639209|
| 4| 614648|
| 5| 631609|
| 6| 629280|
| 7| 648560|
| 8| 653279|
| 9| 600187|
+-----+--------------------------------+